leslie-fang-intel/after_regression.log

## after_regression.log

loading model: 0it [00:00, ?it/s]Input ids are automatically padded from 819 to 832 to be a multiple of `config.block_size`: 64

loading model: 0it [00:02, ?it/s]
cpu  eval  hf_BigBird
V0614 00:48:29.163000 140616046391680 torch/_inductor/freezing.py:118] [0/0_1] TRACED GRAPH
V0614 00:48:29.163000 140616046391680 torch/_inductor/freezing.py:118] [0/0_1]  ===== FROZEN GRAPH =====
V0614 00:48:29.163000 140616046391680 torch/_inductor/freezing.py:118] [0/0_1]  /localdisk/leslie/torch_inductor_community/pytorch/torch/fx/_lazy_graph_module.py class <lambda>(torch.nn.Module):
V0614 00:48:29.163000 140616046391680 torch/_inductor/freezing.py:118] [0/0_1]     def forward(self):
V0614 00:48:29.163000 140616046391680 torch/_inductor/freezing.py:118] [0/0_1]         return ()
V0614 00:48:29.163000 140616046391680 torch/_inductor/freezing.py:118] [0/0_1]
V0614 00:48:29.163000 140616046391680 torch/_inductor/freezing.py:118] [0/0_1]
V0614 00:48:29.254000 140616046391680 torch/_inductor/freezing.py:118] [2/0_1] TRACED GRAPH
V0614 00:48:29.254000 140616046391680 torch/_inductor/freezing.py:118] [2/0_1]  ===== FROZEN GRAPH =====
V0614 00:48:29.254000 140616046391680 torch/_inductor/freezing.py:118] [2/0_1]  /localdisk/leslie/torch_inductor_community/pytorch/torch/fx/_lazy_graph_module.py class <lambda>(torch.nn.Module):
V0614 00:48:29.254000 140616046391680 torch/_inductor/freezing.py:118] [2/0_1]     def forward(self, arg0_1: "i64[1, 4096]"):
V0614 00:48:29.254000 140616046391680 torch/_inductor/freezing.py:118] [2/0_1]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2039 in forward, code: attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
V0614 00:48:29.254000 140616046391680 torch/_inductor/freezing.py:118] [2/0_1]         full_default: "f32[1, 819]" = torch.ops.aten.full.default([1, 819], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False)
V0614 00:48:29.254000 140616046391680 torch/_inductor/freezing.py:118] [2/0_1]
V0614 00:48:29.254000 140616046391680 torch/_inductor/freezing.py:118] [2/0_1]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2042 in forward, code: buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
V0614 00:48:29.254000 140616046391680 torch/_inductor/freezing.py:118] [2/0_1]         slice_2: "i64[1, 819]" = torch.ops.aten.slice.Tensor(arg0_1, 1, 0, 819);  arg0_1 = None
V0614 00:48:29.254000 140616046391680 torch/_inductor/freezing.py:118] [2/0_1]
V0614 00:48:29.254000 140616046391680 torch/_inductor/freezing.py:118] [2/0_1]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2043 in forward, code: buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
V0614 00:48:29.254000 140616046391680 torch/_inductor/freezing.py:118] [2/0_1]         expand: "i64[1, 819]" = torch.ops.aten.expand.default(slice_2, [1, 819]);  slice_2 = None
V0614 00:48:29.254000 140616046391680 torch/_inductor/freezing.py:118] [2/0_1]         return (full_default, expand)
V0614 00:48:29.254000 140616046391680 torch/_inductor/freezing.py:118] [2/0_1]
V0614 00:48:29.254000 140616046391680 torch/_inductor/freezing.py:118] [2/0_1]
I0614 00:48:29.255000 140616046391680 torch/_dynamo/logging.py:56] [2/0_1] Step 3: torchinductor compiling FORWARDS graph 1
V0614 00:48:29.363000 140616046391680 torch/_inductor/compile_fx.py:748] [2/0_1] [__post_grad_graphs] TRACED GRAPH
V0614 00:48:29.363000 140616046391680 torch/_inductor/compile_fx.py:748] [2/0_1] [__post_grad_graphs]  ===== AFTER POST GRAD =====
V0614 00:48:29.363000 140616046391680 torch/_inductor/compile_fx.py:748] [2/0_1] [__post_grad_graphs]  /localdisk/leslie/torch_inductor_community/pytorch/torch/fx/_lazy_graph_module.py class <lambda>(torch.nn.Module):
V0614 00:48:29.363000 140616046391680 torch/_inductor/compile_fx.py:748] [2/0_1] [__post_grad_graphs]     def forward(self, arg0_1: "i64[1, 4096][4096, 1]cpu"):
V0614 00:48:29.363000 140616046391680 torch/_inductor/compile_fx.py:748] [2/0_1] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2039 in forward, code: attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
V0614 00:48:29.363000 140616046391680 torch/_inductor/compile_fx.py:748] [2/0_1] [__post_grad_graphs]         full_default: "f32[1, 819][819, 1]cpu" = torch.ops.aten.full.default([1, 819], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False)
V0614 00:48:29.363000 140616046391680 torch/_inductor/compile_fx.py:748] [2/0_1] [__post_grad_graphs]
V0614 00:48:29.363000 140616046391680 torch/_inductor/compile_fx.py:748] [2/0_1] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2042 in forward, code: buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
V0614 00:48:29.363000 140616046391680 torch/_inductor/compile_fx.py:748] [2/0_1] [__post_grad_graphs]         slice_2: "i64[1, 819][4096, 1]cpu" = torch.ops.aten.slice.Tensor(arg0_1, 1, 0, 819);  arg0_1 = None
V0614 00:48:29.363000 140616046391680 torch/_inductor/compile_fx.py:748] [2/0_1] [__post_grad_graphs]
V0614 00:48:29.363000 140616046391680 torch/_inductor/compile_fx.py:748] [2/0_1] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2043 in forward, code: buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
V0614 00:48:29.363000 140616046391680 torch/_inductor/compile_fx.py:748] [2/0_1] [__post_grad_graphs]         expand: "i64[1, 819][4096, 1]cpu" = torch.ops.aten.expand.default(slice_2, [1, 819]);  slice_2 = None
V0614 00:48:29.363000 140616046391680 torch/_inductor/compile_fx.py:748] [2/0_1] [__post_grad_graphs]         return (full_default, expand)
V0614 00:48:29.363000 140616046391680 torch/_inductor/compile_fx.py:748] [2/0_1] [__post_grad_graphs]
V0614 00:48:29.363000 140616046391680 torch/_inductor/compile_fx.py:748] [2/0_1] [__post_grad_graphs]
V0614 00:48:29.364000 140616046391680 torch/_inductor/graph.py:1173] [2/0_1] lowering %arg0_1 : [num_users=1] = placeholder[target=arg0_1]
V0614 00:48:29.365000 140616046391680 torch/_inductor/graph.py:1173] [2/0_1] lowering %full_default : [num_users=1] = call_function[target=torch.ops.aten.full.default](args = ([1, 819], 1), kwargs = {dtype: torch.float32, layout: torch.strided, device: cpu, pin_memory: False})
V0614 00:48:29.365000 140616046391680 torch/_inductor/graph.py:976] [2/0_1]   via <function full at 0x7fe21799c550>
V0614 00:48:29.366000 140616046391680 torch/_inductor/graph.py:1173] [2/0_1] lowering %slice_2 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%arg0_1, 1, 0, 819), kwargs = {})
V0614 00:48:29.366000 140616046391680 torch/_inductor/graph.py:976] [2/0_1]   via <function slice_ at 0x7fe2179c3490>
V0614 00:48:29.367000 140616046391680 torch/_inductor/graph.py:1173] [2/0_1] lowering %expand : [num_users=1] = call_function[target=torch.ops.aten.expand.default](args = (%slice_2, [1, 819]), kwargs = {})
V0614 00:48:29.367000 140616046391680 torch/_inductor/graph.py:976] [2/0_1]   via <function expand at 0x7fe2179c2cb0>
V0614 00:48:29.367000 140616046391680 torch/_inductor/graph.py:1173] [2/0_1] lowering return (full_default, expand)
V0614 00:48:29.367000 140616046391680 torch/_inductor/graph.py:1097] [2/0_1] Force channels last inputs for 0 conv for the current graph with id 1
V0614 00:48:29.393000 140616046391680 torch/_inductor/scheduler.py:1601] [2/0_1] scheduling ComputedBuffer(name='buf0', layout=FixedLayout('cpu', torch.float32, size=[1, 819], stride=[819, 1]), data=Pointwise(
V0614 00:48:29.393000 140616046391680 torch/_inductor/scheduler.py:1601] [2/0_1]   'cpu',
V0614 00:48:29.393000 140616046391680 torch/_inductor/scheduler.py:1601] [2/0_1]   torch.float32,
V0614 00:48:29.393000 140616046391680 torch/_inductor/scheduler.py:1601] [2/0_1]   def inner_fn(index):
V0614 00:48:29.393000 140616046391680 torch/_inductor/scheduler.py:1601] [2/0_1]       _, i1 = index
V0614 00:48:29.393000 140616046391680 torch/_inductor/scheduler.py:1601] [2/0_1]       tmp0 = ops.constant(1, torch.float32)
V0614 00:48:29.393000 140616046391680 torch/_inductor/scheduler.py:1601] [2/0_1]       return tmp0
V0614 00:48:29.393000 140616046391680 torch/_inductor/scheduler.py:1601] [2/0_1]   ,
V0614 00:48:29.393000 140616046391680 torch/_inductor/scheduler.py:1601] [2/0_1]   ranges=[1, 819],
V0614 00:48:29.393000 140616046391680 torch/_inductor/scheduler.py:1601] [2/0_1]   origin_node=full_default,
V0614 00:48:29.393000 140616046391680 torch/_inductor/scheduler.py:1601] [2/0_1]   origins={full_default}
V0614 00:48:29.393000 140616046391680 torch/_inductor/scheduler.py:1601] [2/0_1] ))
V0614 00:48:29.394000 140616046391680 torch/_inductor/scheduler.py:1671] [2/0_1] scheduling output buf0
V0614 00:48:29.394000 140616046391680 torch/_inductor/scheduler.py:1671] [2/0_1] scheduling output arg0_1
V0614 00:48:29.395000 140616046391680 torch/_inductor/scheduler.py:2688] [2/0_1] Generating code for node buf0 with estimated runtime 0.000000
V0614 00:48:29.396000 140616046391680 torch/_inductor/bounds.py:63] [2/0_1] get_bounds:
V0614 00:48:29.396000 140616046391680 torch/_inductor/bounds.py:63] [2/0_1] graph():
V0614 00:48:29.396000 140616046391680 torch/_inductor/bounds.py:63] [2/0_1]     %ops : [num_users=2] = placeholder[target=ops]
V0614 00:48:29.396000 140616046391680 torch/_inductor/bounds.py:63] [2/0_1]     %constant : [num_users=1] = call_method[target=constant](args = (%ops, 1.0, torch.float32), kwargs = {})
V0614 00:48:29.396000 140616046391680 torch/_inductor/bounds.py:63] [2/0_1]     %get_index : [num_users=1] = call_module[target=get_index](args = (index0,), kwargs = {})
V0614 00:48:29.396000 140616046391680 torch/_inductor/bounds.py:63] [2/0_1]     %store : [num_users=1] = call_method[target=store](args = (%ops, buf0, %get_index, %constant, None), kwargs = {})
V0614 00:48:29.396000 140616046391680 torch/_inductor/bounds.py:63] [2/0_1]     return store
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] Output code:
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] # AOT ID: ['1_inference']
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] from ctypes import c_void_p, c_long
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] import torch
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] import math
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] import random
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] import os
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] import tempfile
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] from math import inf, nan
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] from torch._inductor.hooks import run_intermediate_hooks
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] from torch._inductor.utils import maybe_profile
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] from torch._inductor.codegen.memory_planning import _align as align
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] from torch import device, empty_strided
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] from torch._inductor.async_compile import AsyncCompile
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] from torch._inductor.select_algorithm import extern_kernels
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] from torch._inductor.codegen.multi_kernel import MultiKernelCall
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] aten = torch.ops.aten
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] inductor_ops = torch.ops.inductor
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] _quantized = torch.ops._quantized
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] assert_size_stride = torch._C._dynamo.guards.assert_size_stride
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] alloc_from_pool = torch.ops.inductor._alloc_from_pool
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] async_compile = AsyncCompile()
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] cpp_fused_ones_0 = async_compile.cpp_pybinding(['float*'], '''
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] #include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] extern "C" void kernel(float* out_ptr0)
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] {
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]     {
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]         for(long x0=static_cast<long>(0L); x0<static_cast<long>(816L); x0+=static_cast<long>(16L))
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]         {
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]             auto tmp0 = static_cast<float>(1.0);
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]             auto tmp1 = at::vec::Vectorized<float>(tmp0);
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]             tmp1.store(out_ptr0 + static_cast<long>(x0));
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]         }
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]         #pragma omp simd simdlen(8)
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]         for(long x0=static_cast<long>(816L); x0<static_cast<long>(819L); x0+=static_cast<long>(1L))
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]         {
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]             auto tmp0 = static_cast<float>(1.0);
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]             out_ptr0[static_cast<long>(x0)] = tmp0;
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]         }
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]     }
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] }
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] ''')
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] async_compile.wait(globals())
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] del async_compile
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] def call(args):
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]     arg0_1, = args
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]     args.clear()
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]     assert_size_stride(arg0_1, (1, 4096), (4096, 1))
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]     buf0 = empty_strided_cpu((1, 819), (819, 1), torch.float32)
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]     cpp_fused_ones_0(buf0)
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]     return (buf0, reinterpret_tensor(arg0_1, (1, 819), (4096, 1), 0), )
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] def benchmark_compiled_module(times=10, repeat=10):
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]     from torch._dynamo.testing import rand_strided
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]     from torch._inductor.utils import print_performance
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]     arg0_1 = rand_strided((1, 4096), (4096, 1), device='cpu', dtype=torch.int64)
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]     fn = lambda: call([arg0_1])
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]     return print_performance(fn, times=times, repeat=repeat)
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] if __name__ == "__main__":
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]     from torch._inductor.wrapper_benchmark import compiled_module_main
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]     compiled_module_main('hf_BigBird', benchmark_compiled_module)
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]
V0614 00:48:32.441000 140616046391680 torch/_inductor/graph.py:1714] [2/0_1] Output code written to: /tmp/torchinductor_leslie/ko/ckovbmwilyxv4sl5d74oe6jsmj25rub5gzv5kco7u66lavi64ivy.py
I0614 00:48:32.441000 140616046391680 torch/_inductor/graph.py:1715] [2/0_1] [__output_code] Output code written to: /tmp/torchinductor_leslie/ko/ckovbmwilyxv4sl5d74oe6jsmj25rub5gzv5kco7u66lavi64ivy.py
V0614 00:48:32.443000 140616046391680 torch/_inductor/compile_fx.py:531] [2/0_1] FX codegen and compilation took 3.188s
I0614 00:48:32.443000 140616046391680 torch/_dynamo/logging.py:56] [2/0_1] Step 3: torchinductor done compiling FORWARDS graph 1
V0614 00:48:32.507000 140616046391680 torch/_inductor/freezing.py:118] [4/0] TRACED GRAPH
V0614 00:48:32.507000 140616046391680 torch/_inductor/freezing.py:118] [4/0]  ===== FROZEN GRAPH =====
V0614 00:48:32.507000 140616046391680 torch/_inductor/freezing.py:118] [4/0]  /localdisk/leslie/torch_inductor_community/pytorch/torch/fx/_lazy_graph_module.py class <lambda>(torch.nn.Module):
V0614 00:48:32.507000 140616046391680 torch/_inductor/freezing.py:118] [4/0]     def forward(self, arg0_1: "i64[1, 819]", arg1_1: "f32[1, 819]", arg2_1: "i64[1, 819]"):
V0614 00:48:32.507000 140616046391680 torch/_inductor/freezing.py:118] [4/0]         # File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/functional.py:4552 in pad, code: return torch._C._nn.pad(input, pad, mode, value)
V0614 00:48:32.507000 140616046391680 torch/_inductor/freezing.py:118] [4/0]         constant_pad_nd: "i64[1, 832]" = torch.ops.aten.constant_pad_nd.default(arg0_1, [0, 13], 0.0);  arg0_1 = None
V0614 00:48:32.507000 140616046391680 torch/_inductor/freezing.py:118] [4/0]
V0614 00:48:32.507000 140616046391680 torch/_inductor/freezing.py:118] [4/0]         # File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/functional.py:4552 in pad, code: return torch._C._nn.pad(input, pad, mode, value)
V0614 00:48:32.507000 140616046391680 torch/_inductor/freezing.py:118] [4/0]         constant_pad_nd_1: "f32[1, 832]" = torch.ops.aten.constant_pad_nd.default(arg1_1, [0, 13], 0.0);  arg1_1 = None
V0614 00:48:32.507000 140616046391680 torch/_inductor/freezing.py:118] [4/0]
V0614 00:48:32.507000 140616046391680 torch/_inductor/freezing.py:118] [4/0]         # File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/functional.py:4552 in pad, code: return torch._C._nn.pad(input, pad, mode, value)
V0614 00:48:32.507000 140616046391680 torch/_inductor/freezing.py:118] [4/0]         constant_pad_nd_2: "i64[1, 832]" = torch.ops.aten.constant_pad_nd.default(arg2_1, [0, 13], 0.0);  arg2_1 = None
V0614 00:48:32.507000 140616046391680 torch/_inductor/freezing.py:118] [4/0]         return (constant_pad_nd, constant_pad_nd_1, constant_pad_nd_2)
V0614 00:48:32.507000 140616046391680 torch/_inductor/freezing.py:118] [4/0]
V0614 00:48:32.507000 140616046391680 torch/_inductor/freezing.py:118] [4/0]
I0614 00:48:32.508000 140616046391680 torch/_dynamo/logging.py:56] [4/0] Step 3: torchinductor compiling FORWARDS graph 2
V0614 00:48:32.510000 140616046391680 torch/_inductor/compile_fx.py:748] [4/0] [__post_grad_graphs] TRACED GRAPH
V0614 00:48:32.510000 140616046391680 torch/_inductor/compile_fx.py:748] [4/0] [__post_grad_graphs]  ===== AFTER POST GRAD =====
V0614 00:48:32.510000 140616046391680 torch/_inductor/compile_fx.py:748] [4/0] [__post_grad_graphs]  /localdisk/leslie/torch_inductor_community/pytorch/torch/fx/_lazy_graph_module.py class <lambda>(torch.nn.Module):
V0614 00:48:32.510000 140616046391680 torch/_inductor/compile_fx.py:748] [4/0] [__post_grad_graphs]     def forward(self, arg0_1: "i64[1, 819][819, 1]cpu", arg1_1: "f32[1, 819][819, 1]cpu", arg2_1: "i64[1, 819][4096, 1]cpu"):
V0614 00:48:32.510000 140616046391680 torch/_inductor/compile_fx.py:748] [4/0] [__post_grad_graphs]         # File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/functional.py:4552 in pad, code: return torch._C._nn.pad(input, pad, mode, value)
V0614 00:48:32.510000 140616046391680 torch/_inductor/compile_fx.py:748] [4/0] [__post_grad_graphs]         constant_pad_nd: "i64[1, 832][832, 1]cpu" = torch.ops.aten.constant_pad_nd.default(arg0_1, [0, 13], 0.0);  arg0_1 = None
V0614 00:48:32.510000 140616046391680 torch/_inductor/compile_fx.py:748] [4/0] [__post_grad_graphs]
V0614 00:48:32.510000 140616046391680 torch/_inductor/compile_fx.py:748] [4/0] [__post_grad_graphs]         # File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/functional.py:4552 in pad, code: return torch._C._nn.pad(input, pad, mode, value)
V0614 00:48:32.510000 140616046391680 torch/_inductor/compile_fx.py:748] [4/0] [__post_grad_graphs]         constant_pad_nd_1: "f32[1, 832][832, 1]cpu" = torch.ops.aten.constant_pad_nd.default(arg1_1, [0, 13], 0.0);  arg1_1 = None
V0614 00:48:32.510000 140616046391680 torch/_inductor/compile_fx.py:748] [4/0] [__post_grad_graphs]
V0614 00:48:32.510000 140616046391680 torch/_inductor/compile_fx.py:748] [4/0] [__post_grad_graphs]         # File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/functional.py:4552 in pad, code: return torch._C._nn.pad(input, pad, mode, value)
V0614 00:48:32.510000 140616046391680 torch/_inductor/compile_fx.py:748] [4/0] [__post_grad_graphs]         constant_pad_nd_2: "i64[1, 832][832, 1]cpu" = torch.ops.aten.constant_pad_nd.default(arg2_1, [0, 13], 0.0);  arg2_1 = None
V0614 00:48:32.510000 140616046391680 torch/_inductor/compile_fx.py:748] [4/0] [__post_grad_graphs]         return (constant_pad_nd, constant_pad_nd_1, constant_pad_nd_2)
V0614 00:48:32.510000 140616046391680 torch/_inductor/compile_fx.py:748] [4/0] [__post_grad_graphs]
V0614 00:48:32.510000 140616046391680 torch/_inductor/compile_fx.py:748] [4/0] [__post_grad_graphs]
V0614 00:48:32.511000 140616046391680 torch/_inductor/graph.py:1173] [4/0] lowering %arg0_1 : [num_users=1] = placeholder[target=arg0_1]
V0614 00:48:32.511000 140616046391680 torch/_inductor/graph.py:1173] [4/0] lowering %arg1_1 : [num_users=1] = placeholder[target=arg1_1]
V0614 00:48:32.512000 140616046391680 torch/_inductor/graph.py:1173] [4/0] lowering %arg2_1 : [num_users=1] = placeholder[target=arg2_1]
V0614 00:48:32.512000 140616046391680 torch/_inductor/graph.py:1173] [4/0] lowering %constant_pad_nd : [num_users=1] = call_function[target=torch.ops.aten.constant_pad_nd.default](args = (%arg0_1, [0, 13], 0.0), kwargs = {})
V0614 00:48:32.512000 140616046391680 torch/_inductor/graph.py:976] [4/0]   via <function constant_pad_nd at 0x7fe21799e7a0>
V0614 00:48:32.513000 140616046391680 torch/_inductor/graph.py:1173] [4/0] lowering %constant_pad_nd_1 : [num_users=1] = call_function[target=torch.ops.aten.constant_pad_nd.default](args = (%arg1_1, [0, 13], 0.0), kwargs = {})
V0614 00:48:32.513000 140616046391680 torch/_inductor/graph.py:976] [4/0]   via <function constant_pad_nd at 0x7fe21799e7a0>
V0614 00:48:32.514000 140616046391680 torch/_inductor/graph.py:1173] [4/0] lowering %constant_pad_nd_2 : [num_users=1] = call_function[target=torch.ops.aten.constant_pad_nd.default](args = (%arg2_1, [0, 13], 0.0), kwargs = {})
V0614 00:48:32.514000 140616046391680 torch/_inductor/graph.py:976] [4/0]   via <function constant_pad_nd at 0x7fe21799e7a0>
V0614 00:48:32.514000 140616046391680 torch/_inductor/graph.py:1173] [4/0] lowering return (constant_pad_nd, constant_pad_nd_1, constant_pad_nd_2)
V0614 00:48:32.514000 140616046391680 torch/_inductor/graph.py:1097] [4/0] Force channels last inputs for 0 conv for the current graph with id 2
V0614 00:48:32.526000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] scheduling ComputedBuffer(name='buf0', layout=FixedLayout('cpu', torch.int64, size=[1, 832], stride=[832, 1]), data=Pointwise(
V0614 00:48:32.526000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0]   'cpu',
V0614 00:48:32.526000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0]   torch.int64,
V0614 00:48:32.526000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0]   def inner_fn(index):
V0614 00:48:32.526000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0]       _, i1 = index
V0614 00:48:32.526000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0]       tmp0 = ops.index_expr(i1, torch.int64)
V0614 00:48:32.526000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0]       tmp1 = ops.index_expr(819, torch.int64)
V0614 00:48:32.526000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0]       tmp2 = tmp0 < tmp1
V0614 00:48:32.526000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0]       tmp3 = ops.load(arg0_1, i1)
V0614 00:48:32.526000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0]       tmp4 = ops.masked(tmp2, tmp3, 0)
V0614 00:48:32.526000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0]       return tmp4
V0614 00:48:32.526000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0]   ,
V0614 00:48:32.526000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0]   ranges=[1, 832],
V0614 00:48:32.526000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0]   origin_node=constant_pad_nd,
V0614 00:48:32.526000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0]   origins={constant_pad_nd}
V0614 00:48:32.526000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] ))
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] scheduling ComputedBuffer(name='buf1', layout=FixedLayout('cpu', torch.float32, size=[1, 832], stride=[832, 1]), data=Pointwise(
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0]   'cpu',
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0]   torch.float32,
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0]   def inner_fn(index):
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0]       _, i1 = index
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0]       tmp0 = ops.index_expr(i1, torch.int64)
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0]       tmp1 = ops.index_expr(819, torch.int64)
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0]       tmp2 = tmp0 < tmp1
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0]       tmp3 = ops.load(arg1_1, i1)
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0]       tmp4 = ops.masked(tmp2, tmp3, 0.0)
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0]       return tmp4
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0]   ,
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0]   ranges=[1, 832],
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0]   origin_node=constant_pad_nd_1,
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0]   origins={constant_pad_nd_1}
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] ))
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] scheduling ComputedBuffer(name='buf2', layout=FixedLayout('cpu', torch.int64, size=[1, 832], stride=[832, 1]), data=Pointwise(
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0]   'cpu',
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0]   torch.int64,
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0]   def inner_fn(index):
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0]       _, i1 = index
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0]       tmp0 = ops.index_expr(i1, torch.int64)
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0]       tmp1 = ops.index_expr(819, torch.int64)
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0]       tmp2 = tmp0 < tmp1
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0]       tmp3 = ops.load(arg2_1, i1)
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0]       tmp4 = ops.masked(tmp2, tmp3, 0)
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0]       return tmp4
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0]   ,
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0]   ranges=[1, 832],
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0]   origin_node=constant_pad_nd_2,
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0]   origins={constant_pad_nd_2}
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] ))
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1671] [4/0] scheduling output buf0
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1671] [4/0] scheduling output buf1
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1671] [4/0] scheduling output buf2
V0614 00:48:32.528000 140616046391680 torch/_inductor/scheduler.py:2688] [4/0] Generating code for node buf0 with estimated runtime 0.000000
V0614 00:48:32.529000 140616046391680 torch/_inductor/bounds.py:63] [4/0] get_bounds:
V0614 00:48:32.529000 140616046391680 torch/_inductor/bounds.py:63] [4/0] graph():
V0614 00:48:32.529000 140616046391680 torch/_inductor/bounds.py:63] [4/0]     %ops : [num_users=4] = placeholder[target=ops]
V0614 00:48:32.529000 140616046391680 torch/_inductor/bounds.py:63] [4/0]     %get_index : [num_users=1] = call_module[target=get_index](args = (index0,), kwargs = {})
V0614 00:48:32.529000 140616046391680 torch/_inductor/bounds.py:63] [4/0]     %index_expr : [num_users=1] = call_method[target=index_expr](args = (%ops, %get_index, torch.int64), kwargs = {})
V0614 00:48:32.529000 140616046391680 torch/_inductor/bounds.py:63] [4/0]     %constant : [num_users=1] = call_method[target=constant](args = (%ops, 819, torch.int64), kwargs = {})
V0614 00:48:32.529000 140616046391680 torch/_inductor/bounds.py:63] [4/0]     %lt : [num_users=1] = call_method[target=lt](args = (%ops, %index_expr, %constant), kwargs = {})
V0614 00:48:32.529000 140616046391680 torch/_inductor/bounds.py:63] [4/0]     %masked_subblock1 : [num_users=1] = call_module[target=masked_subblock1](args = (%lt, 0), kwargs = {})
V0614 00:48:32.529000 140616046391680 torch/_inductor/bounds.py:63] [4/0]     %get_index_1 : [num_users=1] = call_module[target=get_index](args = (index0,), kwargs = {})
V0614 00:48:32.529000 140616046391680 torch/_inductor/bounds.py:63] [4/0]     %store : [num_users=1] = call_method[target=store](args = (%ops, buf0, %get_index_1, %masked_subblock1, None), kwargs = {})
V0614 00:48:32.529000 140616046391680 torch/_inductor/bounds.py:63] [4/0]     return store
V0614 00:48:32.533000 140616046391680 torch/_inductor/scheduler.py:2688] [4/0] Generating code for node buf1 with estimated runtime 0.000000
V0614 00:48:32.534000 140616046391680 torch/_inductor/bounds.py:63] [4/0] get_bounds:
V0614 00:48:32.534000 140616046391680 torch/_inductor/bounds.py:63] [4/0] graph():
V0614 00:48:32.534000 140616046391680 torch/_inductor/bounds.py:63] [4/0]     %ops : [num_users=4] = placeholder[target=ops]
V0614 00:48:32.534000 140616046391680 torch/_inductor/bounds.py:63] [4/0]     %get_index : [num_users=1] = call_module[target=get_index](args = (index0,), kwargs = {})
V0614 00:48:32.534000 140616046391680 torch/_inductor/bounds.py:63] [4/0]     %index_expr : [num_users=1] = call_method[target=index_expr](args = (%ops, %get_index, torch.int64), kwargs = {})
V0614 00:48:32.534000 140616046391680 torch/_inductor/bounds.py:63] [4/0]     %constant : [num_users=1] = call_method[target=constant](args = (%ops, 819, torch.int64), kwargs = {})
V0614 00:48:32.534000 140616046391680 torch/_inductor/bounds.py:63] [4/0]     %lt : [num_users=1] = call_method[target=lt](args = (%ops, %index_expr, %constant), kwargs = {})
V0614 00:48:32.534000 140616046391680 torch/_inductor/bounds.py:63] [4/0]     %masked_subblock1 : [num_users=1] = call_module[target=masked_subblock1](args = (%lt, 0.0), kwargs = {})
V0614 00:48:32.534000 140616046391680 torch/_inductor/bounds.py:63] [4/0]     %get_index_1 : [num_users=1] = call_module[target=get_index](args = (index0,), kwargs = {})
V0614 00:48:32.534000 140616046391680 torch/_inductor/bounds.py:63] [4/0]     %store : [num_users=1] = call_method[target=store](args = (%ops, buf1, %get_index_1, %masked_subblock1, None), kwargs = {})
V0614 00:48:32.534000 140616046391680 torch/_inductor/bounds.py:63] [4/0]     return store
V0614 00:48:32.537000 140616046391680 torch/_inductor/scheduler.py:2688] [4/0] Generating code for node buf2 with estimated runtime 0.000000
V0614 00:48:32.538000 140616046391680 torch/_inductor/bounds.py:63] [4/0] get_bounds:
V0614 00:48:32.538000 140616046391680 torch/_inductor/bounds.py:63] [4/0] graph():
V0614 00:48:32.538000 140616046391680 torch/_inductor/bounds.py:63] [4/0]     %ops : [num_users=4] = placeholder[target=ops]
V0614 00:48:32.538000 140616046391680 torch/_inductor/bounds.py:63] [4/0]     %get_index : [num_users=1] = call_module[target=get_index](args = (index0,), kwargs = {})
V0614 00:48:32.538000 140616046391680 torch/_inductor/bounds.py:63] [4/0]     %index_expr : [num_users=1] = call_method[target=index_expr](args = (%ops, %get_index, torch.int64), kwargs = {})
V0614 00:48:32.538000 140616046391680 torch/_inductor/bounds.py:63] [4/0]     %constant : [num_users=1] = call_method[target=constant](args = (%ops, 819, torch.int64), kwargs = {})
V0614 00:48:32.538000 140616046391680 torch/_inductor/bounds.py:63] [4/0]     %lt : [num_users=1] = call_method[target=lt](args = (%ops, %index_expr, %constant), kwargs = {})
V0614 00:48:32.538000 140616046391680 torch/_inductor/bounds.py:63] [4/0]     %masked_subblock1 : [num_users=1] = call_module[target=masked_subblock1](args = (%lt, 0), kwargs = {})
V0614 00:48:32.538000 140616046391680 torch/_inductor/bounds.py:63] [4/0]     %get_index_1 : [num_users=1] = call_module[target=get_index](args = (index0,), kwargs = {})
V0614 00:48:32.538000 140616046391680 torch/_inductor/bounds.py:63] [4/0]     %store : [num_users=1] = call_method[target=store](args = (%ops, buf2, %get_index_1, %masked_subblock1, None), kwargs = {})
V0614 00:48:32.538000 140616046391680 torch/_inductor/bounds.py:63] [4/0]     return store
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] Output code:
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] # AOT ID: ['2_inference']
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] from ctypes import c_void_p, c_long
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] import torch
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] import math
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] import random
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] import os
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] import tempfile
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] from math import inf, nan
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] from torch._inductor.hooks import run_intermediate_hooks
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] from torch._inductor.utils import maybe_profile
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] from torch._inductor.codegen.memory_planning import _align as align
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] from torch import device, empty_strided
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] from torch._inductor.async_compile import AsyncCompile
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] from torch._inductor.select_algorithm import extern_kernels
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] from torch._inductor.codegen.multi_kernel import MultiKernelCall
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] aten = torch.ops.aten
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] inductor_ops = torch.ops.inductor
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] _quantized = torch.ops._quantized
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] assert_size_stride = torch._C._dynamo.guards.assert_size_stride
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] alloc_from_pool = torch.ops.inductor._alloc_from_pool
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] async_compile = AsyncCompile()
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] cpp_fused_constant_pad_nd_0 = async_compile.cpp_pybinding(['const int64_t*', 'const float*', 'const int64_t*', 'int64_t*', 'float*', 'int64_t*'], '''
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] #include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] extern "C" void kernel(const int64_t* in_ptr0,
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]                        const float* in_ptr1,
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]                        const int64_t* in_ptr2,
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]                        int64_t* out_ptr0,
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]                        float* out_ptr1,
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]                        int64_t* out_ptr2)
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] {
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]     {
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]         for(long x0=static_cast<long>(0L); x0<static_cast<long>(832L); x0+=static_cast<long>(16L))
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]         {
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             auto tmp0 = x0;
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             auto tmp1 = c10::convert<int32_t>(tmp0);
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             auto tmp2 = at::vec::Vectorized<int32_t>::arange(tmp1, 1);
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             auto tmp3 = static_cast<int32_t>(819);
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             auto tmp4 = at::vec::Vectorized<int32_t>(tmp3);
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             auto tmp5 = at::vec::VecMask<int32_t,1>(tmp2 < tmp4);
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             auto tmp6 = [&]
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             {
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]                 auto tmp7 = tmp5.template cast<float,1>().template loadu<int64_t,2>(in_ptr0 + static_cast<long>(x0));
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]                 return tmp7;
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             }
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             ;
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             auto tmp10 =
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             [&]
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             {
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]                 if (tmp5.all_zero())
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]                 {
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]                     return at::vec::VectorizedN<int64_t,2>(static_cast<int64_t>(0));
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]                 }
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]                 else
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]                 {
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]                     auto tmp8 = tmp6();
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]                     auto tmp9 = at::vec::VectorizedN<int64_t,2>(static_cast<int64_t>(0));
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]                     return decltype(tmp8)::blendv(tmp9, tmp8, tmp5.template cast<int64_t,2>());
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]                 }
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             }
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             ()
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             ;
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             tmp10.store(out_ptr0 + static_cast<long>(x0), 16);
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]         }
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]     }
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]     {
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]         for(long x0=static_cast<long>(0L); x0<static_cast<long>(832L); x0+=static_cast<long>(16L))
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]         {
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             auto tmp0 = x0;
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             auto tmp1 = c10::convert<int32_t>(tmp0);
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             auto tmp2 = at::vec::Vectorized<int32_t>::arange(tmp1, 1);
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             auto tmp3 = static_cast<int32_t>(819);
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             auto tmp4 = at::vec::Vectorized<int32_t>(tmp3);
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             auto tmp5 = at::vec::VecMask<int32_t,1>(tmp2 < tmp4);
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             auto tmp6 = [&]
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             {
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]                 auto tmp7 = tmp5.template cast<float,1>().template loadu<float,1>(in_ptr1 + static_cast<long>(x0));
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]                 return tmp7;
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             }
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             ;
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             auto tmp10 =
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             [&]
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             {
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]                 if (tmp5.all_zero())
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]                 {
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]                     return at::vec::Vectorized<float>(static_cast<float>(0.0));
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]                 }
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]                 else
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]                 {
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]                     auto tmp8 = tmp6();
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]                     auto tmp9 = at::vec::Vectorized<float>(static_cast<float>(0.0));
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]                     return decltype(tmp8)::blendv(tmp9, tmp8, tmp5.template cast<float,1>());
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]                 }
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             }
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             ()
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             ;
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             tmp10.store(out_ptr1 + static_cast<long>(x0));
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]         }
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]     }
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]     {
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]         for(long x0=static_cast<long>(0L); x0<static_cast<long>(832L); x0+=static_cast<long>(16L))
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]         {
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             auto tmp0 = x0;
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             auto tmp1 = c10::convert<int32_t>(tmp0);
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             auto tmp2 = at::vec::Vectorized<int32_t>::arange(tmp1, 1);
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             auto tmp3 = static_cast<int32_t>(819);
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             auto tmp4 = at::vec::Vectorized<int32_t>(tmp3);
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             auto tmp5 = at::vec::VecMask<int32_t,1>(tmp2 < tmp4);
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             auto tmp6 = [&]
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             {
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]                 auto tmp7 = tmp5.template cast<float,1>().template loadu<int64_t,2>(in_ptr2 + static_cast<long>(x0));
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]                 return tmp7;
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             }
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             ;
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             auto tmp10 =
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             [&]
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             {
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]                 if (tmp5.all_zero())
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]                 {
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]                     return at::vec::VectorizedN<int64_t,2>(static_cast<int64_t>(0));
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]                 }
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]                 else
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]                 {
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]                     auto tmp8 = tmp6();
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]                     auto tmp9 = at::vec::VectorizedN<int64_t,2>(static_cast<int64_t>(0));
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]                     return decltype(tmp8)::blendv(tmp9, tmp8, tmp5.template cast<int64_t,2>());
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]                 }
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             }
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             ()
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             ;
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]             tmp10.store(out_ptr2 + static_cast<long>(x0), 16);
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]         }
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]     }
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] }
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] ''')
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] async_compile.wait(globals())
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] del async_compile
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] def call(args):
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]     arg0_1, arg1_1, arg2_1 = args
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]     args.clear()
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]     assert_size_stride(arg0_1, (1, 819), (819, 1))
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]     assert_size_stride(arg1_1, (1, 819), (819, 1))
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]     assert_size_stride(arg2_1, (1, 819), (4096, 1))
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]     buf0 = empty_strided_cpu((1, 832), (832, 1), torch.int64)
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]     buf1 = empty_strided_cpu((1, 832), (832, 1), torch.float32)
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]     buf2 = empty_strided_cpu((1, 832), (832, 1), torch.int64)
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]     cpp_fused_constant_pad_nd_0(arg0_1, arg1_1, arg2_1, buf0, buf1, buf2)
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]     del arg0_1
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]     del arg1_1
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]     del arg2_1
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]     return (buf0, buf1, buf2, )
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] def benchmark_compiled_module(times=10, repeat=10):
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]     from torch._dynamo.testing import rand_strided
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]     from torch._inductor.utils import print_performance
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]     arg0_1 = rand_strided((1, 819), (819, 1), device='cpu', dtype=torch.int64)
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]     arg1_1 = rand_strided((1, 819), (819, 1), device='cpu', dtype=torch.float32)
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]     arg2_1 = rand_strided((1, 819), (4096, 1), device='cpu', dtype=torch.int64)
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]     fn = lambda: call([arg0_1, arg1_1, arg2_1])
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]     return print_performance(fn, times=times, repeat=repeat)
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] if __name__ == "__main__":
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]     from torch._inductor.wrapper_benchmark import compiled_module_main
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]     compiled_module_main('hf_BigBird', benchmark_compiled_module)
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]
V0614 00:48:33.603000 140616046391680 torch/_inductor/graph.py:1714] [4/0] Output code written to: /tmp/torchinductor_leslie/fj/cfjbx53t7urpjz2ng6piwvoj4vsjiqa2xrgjcrh4vovq4w45eywv.py
I0614 00:48:33.603000 140616046391680 torch/_inductor/graph.py:1715] [4/0] [__output_code] Output code written to: /tmp/torchinductor_leslie/fj/cfjbx53t7urpjz2ng6piwvoj4vsjiqa2xrgjcrh4vovq4w45eywv.py
V0614 00:48:33.604000 140616046391680 torch/_inductor/compile_fx.py:531] [4/0] FX codegen and compilation took 1.097s
I0614 00:48:33.604000 140616046391680 torch/_dynamo/logging.py:56] [4/0] Step 3: torchinductor done compiling FORWARDS graph 2
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] TRACED GRAPH
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]  ===== FROZEN GRAPH =====
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]  /localdisk/leslie/torch_inductor_community/pytorch/torch/fx/_lazy_graph_module.py class <lambda>(torch.nn.Module):
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]     def forward(self, arg6_1: "i64[1, 832]", arg7_1: "f32[1, 832]", arg8_1: "i64[1, 832]"):
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         # No stacktrace found for following nodes
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         _frozen_param0: "f32[50358, 768]" = self._frozen_param0
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         _frozen_param1: "f32[2, 768]" = self._frozen_param1
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         _frozen_param3: "f32[768]" = self._frozen_param3
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         _frozen_param4: "f32[768]" = self._frozen_param4
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2200 in create_masks_for_block_sparse_attn, code: blocked_encoder_mask = attention_mask.view(batch_size, seq_length // block_size, block_size)
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         view: "f32[1, 13, 64]" = torch.ops.aten.reshape.default(arg7_1, [1, 13, 64])
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2194 in create_band_mask_from_inputs, code: [to_blocked_mask[:, 1:-3], to_blocked_mask[:, 2:-2], to_blocked_mask[:, 3:-1]], dim=2
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         slice_2: "f32[1, 9, 64]" = torch.ops.aten.slice.Tensor(view, 1, 1, -3)
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         slice_4: "f32[1, 9, 64]" = torch.ops.aten.slice.Tensor(view, 1, 2, -2)
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         slice_6: "f32[1, 9, 64]" = torch.ops.aten.slice.Tensor(view, 1, 3, -1)
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2193 in create_band_mask_from_inputs, code: exp_blocked_to_pad = torch.cat(
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         cat: "f32[1, 9, 192]" = torch.ops.aten.cat.default([slice_2, slice_4, slice_6], 2);  slice_2 = slice_6 = None
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2196 in create_band_mask_from_inputs, code: band_mask = torch.einsum("blq,blk->blqk", from_blocked_mask[:, 2:-2], exp_blocked_to_pad)
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         unsqueeze: "f32[1, 9, 64, 1]" = torch.ops.aten.unsqueeze.default(slice_4, 3);  slice_4 = None
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         permute: "f32[1, 9, 64, 1]" = torch.ops.aten.permute.default(unsqueeze, [0, 1, 2, 3]);  unsqueeze = None
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         unsqueeze_1: "f32[1, 9, 192, 1]" = torch.ops.aten.unsqueeze.default(cat, 3);  cat = None
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         permute_1: "f32[1, 9, 1, 192]" = torch.ops.aten.permute.default(unsqueeze_1, [0, 1, 3, 2]);  unsqueeze_1 = None
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         mul: "f32[1, 9, 64, 192]" = torch.ops.aten.mul.Tensor(permute, permute_1);  permute = permute_1 = None
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2197 in create_band_mask_from_inputs, code: band_mask.unsqueeze_(1)
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         unsqueeze_2: "f32[1, 1, 9, 64, 192]" = torch.ops.aten.unsqueeze.default(mul, 1);  mul = None
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2203 in create_masks_for_block_sparse_attn, code: from_mask = attention_mask.view(batch_size, 1, seq_length, 1)
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         view_1: "f32[1, 1, 832, 1]" = torch.ops.aten.reshape.default(arg7_1, [1, 1, 832, 1])
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2204 in create_masks_for_block_sparse_attn, code: to_mask = attention_mask.view(batch_size, 1, 1, seq_length)
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         view_2: "f32[1, 1, 1, 832]" = torch.ops.aten.reshape.default(arg7_1, [1, 1, 1, 832]);  arg7_1 = None
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:296 in forward, code: inputs_embeds = self.word_embeddings(input_ids)
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         embedding: "f32[1, 832, 768]" = torch.ops.aten.embedding.default(_frozen_param0, arg6_1, 0);  _frozen_param0 = arg6_1 = None
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:301 in forward, code: token_type_embeddings = self.token_type_embeddings(token_type_ids)
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         embedding_1: "f32[1, 832, 768]" = torch.ops.aten.embedding.default(_frozen_param1, arg8_1);  _frozen_param1 = arg8_1 = None
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:303 in forward, code: embeddings = inputs_embeds + token_type_embeddings
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         add: "f32[1, 832, 768]" = torch.ops.aten.add.Tensor(embedding, embedding_1);  embedding = embedding_1 = None
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:305 in forward, code: position_embeddings = self.position_embeddings(position_ids)
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         _frozen_param6: "f32[1, 832, 768]" = self._frozen_param6
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:306 in forward, code: embeddings += position_embeddings
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         add_1: "f32[1, 832, 768]" = torch.ops.aten.add.Tensor(add, _frozen_param6);  add = _frozen_param6 = None
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:309 in forward, code: embeddings = self.LayerNorm(embeddings)
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         var_mean = torch.ops.aten.var_mean.correction(add_1, [2], correction = 0, keepdim = True)
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         getitem: "f32[1, 832, 1]" = var_mean[0]
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         getitem_1: "f32[1, 832, 1]" = var_mean[1];  var_mean = None
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         add_2: "f32[1, 832, 1]" = torch.ops.aten.add.Tensor(getitem, 1e-12);  getitem = None
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         rsqrt: "f32[1, 832, 1]" = torch.ops.aten.rsqrt.default(add_2);  add_2 = None
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         sub: "f32[1, 832, 768]" = torch.ops.aten.sub.Tensor(add_1, getitem_1);  add_1 = getitem_1 = None
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         mul_1: "f32[1, 832, 768]" = torch.ops.aten.mul.Tensor(sub, rsqrt);  sub = rsqrt = None
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         mul_2: "f32[1, 832, 768]" = torch.ops.aten.mul.Tensor(mul_1, _frozen_param3);  mul_1 = _frozen_param3 = None
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         add_3: "f32[1, 832, 768]" = torch.ops.aten.add.Tensor(mul_2, _frozen_param4);  mul_2 = _frozen_param4 = None
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]         return (add_3, unsqueeze_2, view_1, view_2, view)
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]
I0614 00:48:33.839000 140616046391680 torch/_dynamo/logging.py:56] [5/0_1] Step 3: torchinductor compiling FORWARDS graph 3
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] TRACED GRAPH
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]  ===== AFTER POST GRAD =====
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]  /localdisk/leslie/torch_inductor_community/pytorch/torch/fx/_lazy_graph_module.py class <lambda>(torch.nn.Module):
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]     def forward(self, arg6_1: "i64[1, 832][832, 1]cpu", arg7_1: "f32[1, 832][832, 1]cpu", arg8_1: "i64[1, 832][832, 1]cpu"):
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         # No stacktrace found for following nodes
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         _frozen_param0: "f32[50358, 768][768, 1]cpu" = self._frozen_param0
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         _frozen_param1: "f32[2, 768][768, 1]cpu" = self._frozen_param1
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         _frozen_param3: "f32[768][1]cpu" = self._frozen_param3
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         _frozen_param4: "f32[768][1]cpu" = self._frozen_param4
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:305 in forward, code: position_embeddings = self.position_embeddings(position_ids)
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         _frozen_param6: "f32[1, 832, 768][638976, 768, 1]cpu" = self._frozen_param6
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:296 in forward, code: inputs_embeds = self.word_embeddings(input_ids)
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         embedding: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.embedding.default(_frozen_param0, arg6_1, 0);  _frozen_param0 = arg6_1 = None
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:301 in forward, code: token_type_embeddings = self.token_type_embeddings(token_type_ids)
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         embedding_1: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.embedding.default(_frozen_param1, arg8_1);  _frozen_param1 = arg8_1 = None
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:303 in forward, code: embeddings = inputs_embeds + token_type_embeddings
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         add: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(embedding, embedding_1);  embedding = embedding_1 = None
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:306 in forward, code: embeddings += position_embeddings
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         add_1: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(add, _frozen_param6);  add = _frozen_param6 = None
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:309 in forward, code: embeddings = self.LayerNorm(embeddings)
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         var_mean = torch.ops.aten.var_mean.correction(add_1, [2], correction = 0, keepdim = True)
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         getitem: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[0]
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         getitem_1: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[1];  var_mean = None
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         sub: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.sub.Tensor(add_1, getitem_1);  add_1 = getitem_1 = None
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         add_2: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.add.Tensor(getitem, 1e-12);  getitem = None
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         rsqrt: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.rsqrt.default(add_2);  add_2 = None
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         mul_1: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(sub, rsqrt);  sub = rsqrt = None
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         mul_2: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(mul_1, _frozen_param3);  mul_1 = _frozen_param3 = None
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         add_3: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(mul_2, _frozen_param4);  mul_2 = _frozen_param4 = None
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2200 in create_masks_for_block_sparse_attn, code: blocked_encoder_mask = attention_mask.view(batch_size, seq_length // block_size, block_size)
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         view: "f32[1, 13, 64][832, 64, 1]cpu" = torch.ops.aten.reshape.default(arg7_1, [1, 13, 64])
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2194 in create_band_mask_from_inputs, code: [to_blocked_mask[:, 1:-3], to_blocked_mask[:, 2:-2], to_blocked_mask[:, 3:-1]], dim=2
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         slice_4: "f32[1, 9, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view, 1, 2, -2)
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2196 in create_band_mask_from_inputs, code: band_mask = torch.einsum("blq,blk->blqk", from_blocked_mask[:, 2:-2], exp_blocked_to_pad)
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         unsqueeze: "f32[1, 9, 64, 1][832, 64, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_4, 3)
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         permute: "f32[1, 9, 64, 1][832, 64, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze, [0, 1, 2, 3]);  unsqueeze = None
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2194 in create_band_mask_from_inputs, code: [to_blocked_mask[:, 1:-3], to_blocked_mask[:, 2:-2], to_blocked_mask[:, 3:-1]], dim=2
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         slice_2: "f32[1, 9, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view, 1, 1, -3)
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         slice_6: "f32[1, 9, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view, 1, 3, -1)
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2193 in create_band_mask_from_inputs, code: exp_blocked_to_pad = torch.cat(
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         cat: "f32[1, 9, 192][1728, 192, 1]cpu" = torch.ops.aten.cat.default([slice_2, slice_4, slice_6], 2);  slice_2 = slice_4 = slice_6 = None
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2196 in create_band_mask_from_inputs, code: band_mask = torch.einsum("blq,blk->blqk", from_blocked_mask[:, 2:-2], exp_blocked_to_pad)
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         unsqueeze_1: "f32[1, 9, 192, 1][1728, 192, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(cat, 3);  cat = None
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         permute_1: "f32[1, 9, 1, 192][1728, 192, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_1, [0, 1, 3, 2]);  unsqueeze_1 = None
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         mul: "f32[1, 9, 64, 192][110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(permute, permute_1);  permute = permute_1 = None
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2197 in create_band_mask_from_inputs, code: band_mask.unsqueeze_(1)
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         unsqueeze_2: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = torch.ops.aten.unsqueeze.default(mul, 1);  mul = None
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2203 in create_masks_for_block_sparse_attn, code: from_mask = attention_mask.view(batch_size, 1, seq_length, 1)
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         view_1: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu" = torch.ops.aten.reshape.default(arg7_1, [1, 1, 832, 1])
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2204 in create_masks_for_block_sparse_attn, code: to_mask = attention_mask.view(batch_size, 1, 1, seq_length)
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         view_2: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.reshape.default(arg7_1, [1, 1, 1, 832]);  arg7_1 = None
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]         return (add_3, unsqueeze_2, view_1, view_2, view)
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]
V0614 00:48:33.858000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %_frozen_param0 : [num_users=1] = get_attr[target=_frozen_param0]
V0614 00:48:33.859000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %_frozen_param1 : [num_users=1] = get_attr[target=_frozen_param1]
V0614 00:48:33.860000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %_frozen_param3 : [num_users=1] = get_attr[target=_frozen_param3]
V0614 00:48:33.860000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %_frozen_param4 : [num_users=1] = get_attr[target=_frozen_param4]
V0614 00:48:33.860000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %arg6_1 : [num_users=1] = placeholder[target=arg6_1]
V0614 00:48:33.861000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %arg7_1 : [num_users=3] = placeholder[target=arg7_1]
V0614 00:48:33.861000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %arg8_1 : [num_users=1] = placeholder[target=arg8_1]
V0614 00:48:33.861000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %_frozen_param6 : [num_users=1] = get_attr[target=_frozen_param6]
V0614 00:48:33.862000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %embedding : [num_users=1] = call_function[target=torch.ops.aten.embedding.default](args = (%_frozen_param0, %arg6_1, 0), kwargs = {})
V0614 00:48:33.862000 140616046391680 torch/_inductor/graph.py:976] [5/0_1]   via <function embedding at 0x7fe21799c790>
V0614 00:48:33.863000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %embedding_1 : [num_users=1] = call_function[target=torch.ops.aten.embedding.default](args = (%_frozen_param1, %arg8_1), kwargs = {})
V0614 00:48:33.863000 140616046391680 torch/_inductor/graph.py:976] [5/0_1]   via <function embedding at 0x7fe21799c790>
V0614 00:48:33.864000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %add : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%embedding, %embedding_1), kwargs = {})
V0614 00:48:33.864000 140616046391680 torch/_inductor/graph.py:976] [5/0_1]   via <function make_pointwise.<locals>.inner at 0x7fe216566ef0>
V0614 00:48:33.866000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %add_1 : [num_users=2] = call_function[target=torch.ops.aten.add.Tensor](args = (%add, %_frozen_param6), kwargs = {})
V0614 00:48:33.866000 140616046391680 torch/_inductor/graph.py:976] [5/0_1]   via <function make_pointwise.<locals>.inner at 0x7fe216566ef0>
V0614 00:48:33.868000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %var_mean : [num_users=2] = call_function[target=torch.ops.aten.var_mean.correction](args = (%add_1, [2]), kwargs = {correction: 0, keepdim: True})
V0614 00:48:33.868000 140616046391680 torch/_inductor/graph.py:976] [5/0_1]   via <function var_mean at 0x7fe216564820>
V0614 00:48:33.870000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %getitem : [num_users=1] = call_function[target=operator.getitem](args = (%var_mean, 0), kwargs = {})
V0614 00:48:33.870000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %getitem_1 : [num_users=1] = call_function[target=operator.getitem](args = (%var_mean, 1), kwargs = {})
V0614 00:48:33.870000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %sub : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%add_1, %getitem_1), kwargs = {})
V0614 00:48:33.870000 140616046391680 torch/_inductor/graph.py:976] [5/0_1]   via <function make_pointwise.<locals>.inner at 0x7fe216574280>
V0614 00:48:33.873000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %add_2 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%getitem, 1e-12), kwargs = {})
V0614 00:48:33.873000 140616046391680 torch/_inductor/graph.py:976] [5/0_1]   via <function make_pointwise.<locals>.inner at 0x7fe216566ef0>
V0614 00:48:33.874000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %rsqrt : [num_users=1] = call_function[target=torch.ops.aten.rsqrt.default](args = (%add_2,), kwargs = {})
V0614 00:48:33.874000 140616046391680 torch/_inductor/graph.py:976] [5/0_1]   via <function rsqrt at 0x7fe216565900>
V0614 00:48:33.875000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %mul_1 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub, %rsqrt), kwargs = {})
V0614 00:48:33.875000 140616046391680 torch/_inductor/graph.py:976] [5/0_1]   via <function mul at 0x7fe2165653f0>
V0614 00:48:33.877000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %mul_2 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mul_1, %_frozen_param3), kwargs = {})
V0614 00:48:33.877000 140616046391680 torch/_inductor/graph.py:976] [5/0_1]   via <function mul at 0x7fe2165653f0>
V0614 00:48:33.879000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %add_3 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_2, %_frozen_param4), kwargs = {})
V0614 00:48:33.879000 140616046391680 torch/_inductor/graph.py:976] [5/0_1]   via <function make_pointwise.<locals>.inner at 0x7fe216566ef0>
V0614 00:48:33.881000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %view : [num_users=4] = call_function[target=torch.ops.aten.reshape.default](args = (%arg7_1, [1, 13, 64]), kwargs = {})
V0614 00:48:33.881000 140616046391680 torch/_inductor/graph.py:976] [5/0_1]   via <function view at 0x7fe2179c3130>
V0614 00:48:33.882000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %slice_4 : [num_users=2] = call_function[target=torch.ops.aten.slice.Tensor](args = (%view, 1, 2, -2), kwargs = {})
V0614 00:48:33.882000 140616046391680 torch/_inductor/graph.py:976] [5/0_1]   via <function slice_ at 0x7fe2179c3490>
V0614 00:48:33.883000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %unsqueeze : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%slice_4, 3), kwargs = {})
V0614 00:48:33.883000 140616046391680 torch/_inductor/graph.py:976] [5/0_1]   via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:33.884000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %permute : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%unsqueeze, [0, 1, 2, 3]), kwargs = {})
V0614 00:48:33.884000 140616046391680 torch/_inductor/graph.py:976] [5/0_1]   via <function permute at 0x7fe2179c3370>
V0614 00:48:33.884000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %slice_2 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%view, 1, 1, -3), kwargs = {})
V0614 00:48:33.885000 140616046391680 torch/_inductor/graph.py:976] [5/0_1]   via <function slice_ at 0x7fe2179c3490>
V0614 00:48:33.886000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %slice_6 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%view, 1, 3, -1), kwargs = {})
V0614 00:48:33.886000 140616046391680 torch/_inductor/graph.py:976] [5/0_1]   via <function slice_ at 0x7fe2179c3490>
V0614 00:48:33.887000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %cat : [num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%slice_2, %slice_4, %slice_6], 2), kwargs = {})
V0614 00:48:33.887000 140616046391680 torch/_inductor/graph.py:976] [5/0_1]   via <function cat at 0x7fe2179c40d0>
V0614 00:48:33.888000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %unsqueeze_1 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%cat, 3), kwargs = {})
V0614 00:48:33.888000 140616046391680 torch/_inductor/graph.py:976] [5/0_1]   via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:33.889000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %permute_1 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%unsqueeze_1, [0, 1, 3, 2]), kwargs = {})
V0614 00:48:33.889000 140616046391680 torch/_inductor/graph.py:976] [5/0_1]   via <function permute at 0x7fe2179c3370>
V0614 00:48:33.889000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %mul : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%permute, %permute_1), kwargs = {})
V0614 00:48:33.889000 140616046391680 torch/_inductor/graph.py:976] [5/0_1]   via <function mul at 0x7fe2165653f0>
V0614 00:48:33.892000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %unsqueeze_2 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%mul, 1), kwargs = {})
V0614 00:48:33.892000 140616046391680 torch/_inductor/graph.py:976] [5/0_1]   via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:33.894000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %view_1 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%arg7_1, [1, 1, 832, 1]), kwargs = {})
V0614 00:48:33.894000 140616046391680 torch/_inductor/graph.py:976] [5/0_1]   via <function view at 0x7fe2179c3130>
V0614 00:48:33.895000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %view_2 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%arg7_1, [1, 1, 1, 832]), kwargs = {})
V0614 00:48:33.895000 140616046391680 torch/_inductor/graph.py:976] [5/0_1]   via <function view at 0x7fe2179c3130>
V0614 00:48:33.895000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering return (add_3, unsqueeze_2, view_1, view_2, view)
V0614 00:48:33.898000 140616046391680 torch/_inductor/graph.py:1097] [5/0_1] Force channels last inputs for 0 conv for the current graph with id 3
V0614 00:48:33.953000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] scheduling ComputedBuffer(name='buf0', layout=FixedLayout('cpu', torch.float32, size=[1, 832, 768], stride=[638976, 768, 1]), data=Pointwise(
V0614 00:48:33.953000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   'cpu',
V0614 00:48:33.953000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   torch.float32,
V0614 00:48:33.953000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   def inner_fn(index):
V0614 00:48:33.953000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       _, i1, i2 = index
V0614 00:48:33.953000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       tmp0 = ops.load(arg6_1, i1)
V0614 00:48:33.953000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       tmp1 = ops.load(_frozen_param0, i2 + 768 * tmp0)
V0614 00:48:33.953000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       tmp2 = ops.load(arg8_1, i1)
V0614 00:48:33.953000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       tmp3 = ops.load(_frozen_param1, i2 + 768 * tmp2)
V0614 00:48:33.953000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       tmp4 = tmp1 + tmp3
V0614 00:48:33.953000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       tmp5 = ops.load(_frozen_param6, i2 + 768 * i1)
V0614 00:48:33.953000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       tmp6 = tmp4 + tmp5
V0614 00:48:33.953000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       return tmp6
V0614 00:48:33.953000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   ,
V0614 00:48:33.953000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   ranges=[1, 832, 768],
V0614 00:48:33.953000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   origin_node=add_1,
V0614 00:48:33.953000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   origins={embedding, add, embedding_1, add_1}
V0614 00:48:33.953000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ))
V0614 00:48:33.954000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] scheduling ComputedBuffer(name='buf1', layout=FixedLayout('cpu', torch.float32, size=[1, 832, 1], stride=[832, 1, 832]), data=WelfordReduction(
V0614 00:48:33.954000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   'cpu',
V0614 00:48:33.954000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   torch.float32,
V0614 00:48:33.954000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   def inner_fn(index, rindex):
V0614 00:48:33.954000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       _, i1, _ = index
V0614 00:48:33.954000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       r0 = rindex
V0614 00:48:33.954000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       tmp0 = ops.load(buf0, r0 + 768 * i1)
V0614 00:48:33.954000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       return tmp0
V0614 00:48:33.954000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   ,
V0614 00:48:33.954000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   ranges=[1, 832, 1],
V0614 00:48:33.954000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   reduction_ranges=[768],
V0614 00:48:33.954000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   reduction_type=welford_reduce,
V0614 00:48:33.954000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   origin_node=getitem_1,
V0614 00:48:33.954000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   origins={var_mean}
V0614 00:48:33.954000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ))
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] scheduling ComputedBuffer(name='buf2', layout=FixedLayout('cpu', torch.float32, size=[1, 832, 1], stride=[832, 1, 832]), data=WelfordReduction(
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   'cpu',
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   torch.float32,
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   def inner_fn(index, rindex):
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       _, i1, _ = index
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       r0 = rindex
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       tmp0 = ops.load(buf0, r0 + 768 * i1)
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       return tmp0
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   ,
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   ranges=[1, 832, 1],
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   reduction_ranges=[768],
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   reduction_type=welford_reduce,
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   origin_node=None,
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   origins={var_mean}
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ))
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] scheduling ComputedBuffer(name='buf3', layout=FixedLayout('cpu', torch.float32, size=[1, 832, 1], stride=[832, 1, 832]), data=WelfordReduction(
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   'cpu',
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   torch.float32,
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   def inner_fn(index, rindex):
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       _, i1, _ = index
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       r0 = rindex
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       tmp0 = ops.load(buf0, r0 + 768 * i1)
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       return tmp0
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   ,
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   ranges=[1, 832, 1],
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   reduction_ranges=[768],
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   reduction_type=welford_reduce,
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   origin_node=None,
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   origins={var_mean}
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ))
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] scheduling ComputedBuffer(name='buf4', layout=FixedLayout('cpu', torch.float32, size=[1, 832, 768], stride=[638976, 768, 1]), data=Pointwise(
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   'cpu',
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   torch.float32,
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   def inner_fn(index):
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       _, i1, i2 = index
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       tmp0 = ops.load(buf0, i2 + 768 * i1)
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       tmp1 = ops.load(buf1, i1)
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       tmp2 = tmp0 - tmp1
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       tmp3 = ops.load(buf2, i1)
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       tmp4 = ops.constant(0, torch.float32)
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       tmp5 = ops.constant(768, torch.float32)
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       tmp6 = ops.constant(0, torch.float32)
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       tmp7 = tmp5 - tmp4
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       tmp8 = ops.maximum(tmp6, tmp7)
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       tmp9 = tmp3 / tmp8
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       tmp10 = ops.constant(1e-12, torch.float32)
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       tmp11 = tmp9 + tmp10
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       tmp12 = ops.rsqrt(tmp11)
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       tmp13 = tmp2 * tmp12
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       tmp14 = ops.load(_frozen_param3, i2)
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       tmp15 = tmp13 * tmp14
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       tmp16 = ops.load(_frozen_param4, i2)
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       tmp17 = tmp15 + tmp16
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       return tmp17
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   ,
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   ranges=[1, 832, 768],
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   origin_node=add_3,
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   origins={mul_1, mul_2, rsqrt, add_2, var_mean, sub, add_3}
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ))
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] scheduling ComputedBuffer(name='buf5', layout=NonOwningLayout('cpu', torch.float32, size=[1, 9, 64], stride=[1728, 192, 1]), data=Pointwise(
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   'cpu',
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   torch.float32,
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   def inner_fn(index):
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       _, i1, i2 = index
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       tmp0 = ops.load(arg7_1, 64 + i2 + 64 * i1)
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       return tmp0
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   ,
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   ranges=[1, 9, 64],
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   origin_node=None,
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   origins={cat}
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ))
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] scheduling ComputedBuffer(name='buf6', layout=NonOwningLayout('cpu', torch.float32, size=[1, 9, 64], stride=[1728, 192, 1]), data=Pointwise(
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   'cpu',
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   torch.float32,
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   def inner_fn(index):
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       _, i1, i2 = index
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       tmp0 = ops.load(arg7_1, 128 + i2 + 64 * i1)
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       return tmp0
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   ,
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   ranges=[1, 9, 64],
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   origin_node=None,
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   origins={cat}
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ))
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] scheduling ComputedBuffer(name='buf7', layout=NonOwningLayout('cpu', torch.float32, size=[1, 9, 64], stride=[1728, 192, 1]), data=Pointwise(
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   'cpu',
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   torch.float32,
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   def inner_fn(index):
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       _, i1, i2 = index
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       tmp0 = ops.load(arg7_1, 192 + i2 + 64 * i1)
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       return tmp0
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   ,
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   ranges=[1, 9, 64],
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   origin_node=None,
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   origins={cat}
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ))
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] scheduling ConcatKernel(name='buf8', layout=FixedLayout('cpu', torch.float32, size=[1, 9, 192], stride=[1728, 192, 1]), inputs=[ComputedBuffer(name='buf5', layout=NonOwningLayout('cpu', torch.float32, size=[1, 9, 64], stride=[1728, 192, 1]), data=Pointwise(
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   'cpu',
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   torch.float32,
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   def inner_fn(index):
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       _, i1, i2 = index
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       tmp0 = ops.load(arg7_1, 64 + i2 + 64 * i1)
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       return tmp0
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   ,
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   ranges=[1, 9, 64],
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   origin_node=None,
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   origins={cat}
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] )), ComputedBuffer(name='buf6', layout=NonOwningLayout('cpu', torch.float32, size=[1, 9, 64], stride=[1728, 192, 1]), data=Pointwise(
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   'cpu',
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   torch.float32,
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   def inner_fn(index):
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       _, i1, i2 = index
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       tmp0 = ops.load(arg7_1, 128 + i2 + 64 * i1)
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       return tmp0
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   ,
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   ranges=[1, 9, 64],
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   origin_node=None,
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   origins={cat}
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] )), ComputedBuffer(name='buf7', layout=NonOwningLayout('cpu', torch.float32, size=[1, 9, 64], stride=[1728, 192, 1]), data=Pointwise(
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   'cpu',
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   torch.float32,
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   def inner_fn(index):
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       _, i1, i2 = index
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       tmp0 = ops.load(arg7_1, 192 + i2 + 64 * i1)
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       return tmp0
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   ,
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   ranges=[1, 9, 64],
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   origin_node=None,
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   origins={cat}
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ))])
V0614 00:48:33.959000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] scheduling ComputedBuffer(name='buf9', layout=FixedLayout('cpu', torch.float32, size=[1, 9, 64, 192], stride=[110592, 12288, 192, 1]), data=Pointwise(
V0614 00:48:33.959000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   'cpu',
V0614 00:48:33.959000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   torch.float32,
V0614 00:48:33.959000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   def inner_fn(index):
V0614 00:48:33.959000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       _, i1, i2, i3 = index
V0614 00:48:33.959000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       tmp0 = ops.load(arg7_1, 128 + i2 + 64 * i1)
V0614 00:48:33.959000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       tmp1 = ops.load(buf8, i3 + 192 * i1)
V0614 00:48:33.959000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       tmp2 = tmp0 * tmp1
V0614 00:48:33.959000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]       return tmp2
V0614 00:48:33.959000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   ,
V0614 00:48:33.959000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   ranges=[1, 9, 64, 192],
V0614 00:48:33.959000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   origin_node=mul,
V0614 00:48:33.959000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1]   origins={mul}
V0614 00:48:33.959000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ))
V0614 00:48:33.959000 140616046391680 torch/_inductor/scheduler.py:1671] [5/0_1] scheduling output buf4
V0614 00:48:33.959000 140616046391680 torch/_inductor/scheduler.py:1671] [5/0_1] scheduling output buf9
V0614 00:48:33.960000 140616046391680 torch/_inductor/scheduler.py:1671] [5/0_1] scheduling output arg7_1
V0614 00:48:33.960000 140616046391680 torch/_inductor/scheduler.py:1671] [5/0_1] scheduling output arg7_1
V0614 00:48:33.960000 140616046391680 torch/_inductor/scheduler.py:1671] [5/0_1] scheduling output arg7_1
V0614 00:48:33.960000 140616046391680 torch/_inductor/scheduler.py:1764] [5/0_1] removed dead node: buf3
V0614 00:48:33.962000 140616046391680 torch/_inductor/scheduler.py:2688] [5/0_1] Generating code for node buf0_buf1_buf2_buf4 with estimated runtime 0.000000
V0614 00:48:33.964000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] get_bounds:
V0614 00:48:33.964000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] graph():
V0614 00:48:33.964000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %ops : [num_users=8] = placeholder[target=ops]
V0614 00:48:33.964000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %get_index : [num_users=1] = call_module[target=get_index](args = (index0,), kwargs = {})
V0614 00:48:33.964000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %load : [num_users=1] = call_method[target=load](args = (%ops, arg6_1, %get_index), kwargs = {})
V0614 00:48:33.964000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %set_indirect0 : [num_users=0] = call_module[target=set_indirect0](args = (%load,), kwargs = {})
V0614 00:48:33.964000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %get_index_1 : [num_users=1] = call_module[target=get_index](args = (index1,), kwargs = {})
V0614 00:48:33.964000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %load_1 : [num_users=1] = call_method[target=load](args = (%ops, _frozen_param0, %get_index_1), kwargs = {})
V0614 00:48:33.964000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %get_index_2 : [num_users=1] = call_module[target=get_index](args = (index0,), kwargs = {})
V0614 00:48:33.964000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %load_2 : [num_users=1] = call_method[target=load](args = (%ops, arg8_1, %get_index_2), kwargs = {})
V0614 00:48:33.964000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %set_indirect1 : [num_users=0] = call_module[target=set_indirect1](args = (%load_2,), kwargs = {})
V0614 00:48:33.964000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %get_index_3 : [num_users=1] = call_module[target=get_index](args = (index2,), kwargs = {})
V0614 00:48:33.964000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %load_3 : [num_users=1] = call_method[target=load](args = (%ops, _frozen_param1, %get_index_3), kwargs = {})
V0614 00:48:33.964000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %add : [num_users=1] = call_method[target=add](args = (%ops, %load_1, %load_3), kwargs = {})
V0614 00:48:33.964000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %get_index_4 : [num_users=1] = call_module[target=get_index](args = (index3,), kwargs = {})
V0614 00:48:33.964000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %load_4 : [num_users=1] = call_method[target=load](args = (%ops, _frozen_param6, %get_index_4), kwargs = {})
V0614 00:48:33.964000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %add_1 : [num_users=1] = call_method[target=add](args = (%ops, %add, %load_4), kwargs = {})
V0614 00:48:33.964000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %get_index_5 : [num_users=1] = call_module[target=get_index](args = (index3,), kwargs = {})
V0614 00:48:33.964000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %store : [num_users=1] = call_method[target=store](args = (%ops, buf0, %get_index_5, %add_1, None), kwargs = {})
V0614 00:48:33.964000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     return store
V0614 00:48:33.970000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] get_bounds:
V0614 00:48:33.970000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] graph():
V0614 00:48:33.970000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %ops : [num_users=3] = placeholder[target=ops]
V0614 00:48:33.970000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %get_index : [num_users=1] = call_module[target=get_index](args = (index0,), kwargs = {})
V0614 00:48:33.970000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %load : [num_users=1] = call_method[target=load](args = (%ops, buf0, %get_index), kwargs = {})
V0614 00:48:33.970000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %reduction : [num_users=3] = call_method[target=reduction](args = (%ops, torch.float32, torch.float32, welford_reduce, %load), kwargs = {})
V0614 00:48:33.970000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %getitem : [num_users=1] = call_function[target=operator.getitem](args = (%reduction, 0), kwargs = {})
V0614 00:48:33.970000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %getitem_1 : [num_users=0] = call_function[target=operator.getitem](args = (%reduction, 1), kwargs = {})
V0614 00:48:33.970000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %getitem_2 : [num_users=0] = call_function[target=operator.getitem](args = (%reduction, 2), kwargs = {})
V0614 00:48:33.970000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %get_index_1 : [num_users=1] = call_module[target=get_index](args = (index1,), kwargs = {})
V0614 00:48:33.970000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %store_reduction : [num_users=1] = call_method[target=store_reduction](args = (%ops, buf1, %get_index_1, %getitem), kwargs = {})
V0614 00:48:33.970000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     return store_reduction
V0614 00:48:33.971000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] get_bounds:
V0614 00:48:33.971000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] graph():
V0614 00:48:33.971000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %ops : [num_users=3] = placeholder[target=ops]
V0614 00:48:33.971000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %get_index : [num_users=1] = call_module[target=get_index](args = (index0,), kwargs = {})
V0614 00:48:33.971000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %load : [num_users=1] = call_method[target=load](args = (%ops, buf0, %get_index), kwargs = {})
V0614 00:48:33.971000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %reduction : [num_users=3] = call_method[target=reduction](args = (%ops, torch.float32, torch.float32, welford_reduce, %load), kwargs = {})
V0614 00:48:33.971000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %getitem : [num_users=0] = call_function[target=operator.getitem](args = (%reduction, 0), kwargs = {})
V0614 00:48:33.971000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %getitem_1 : [num_users=1] = call_function[target=operator.getitem](args = (%reduction, 1), kwargs = {})
V0614 00:48:33.971000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %getitem_2 : [num_users=0] = call_function[target=operator.getitem](args = (%reduction, 2), kwargs = {})
V0614 00:48:33.971000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %get_index_1 : [num_users=1] = call_module[target=get_index](args = (index1,), kwargs = {})
V0614 00:48:33.971000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %store_reduction : [num_users=1] = call_method[target=store_reduction](args = (%ops, buf2, %get_index_1, %getitem_1), kwargs = {})
V0614 00:48:33.971000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     return store_reduction
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] get_bounds:
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] graph():
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %ops : [num_users=15] = placeholder[target=ops]
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %get_index : [num_users=1] = call_module[target=get_index](args = (index0,), kwargs = {})
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %load : [num_users=1] = call_method[target=load](args = (%ops, buf0, %get_index), kwargs = {})
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %get_index_1 : [num_users=1] = call_module[target=get_index](args = (index1,), kwargs = {})
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %load_1 : [num_users=1] = call_method[target=load](args = (%ops, buf1, %get_index_1), kwargs = {})
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %sub : [num_users=1] = call_method[target=sub](args = (%ops, %load, %load_1), kwargs = {})
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %get_index_2 : [num_users=1] = call_module[target=get_index](args = (index1,), kwargs = {})
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %load_2 : [num_users=1] = call_method[target=load](args = (%ops, buf2, %get_index_2), kwargs = {})
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %constant : [num_users=1] = call_method[target=constant](args = (%ops, 768.0, torch.float32), kwargs = {})
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %truediv : [num_users=1] = call_method[target=truediv](args = (%ops, %load_2, %constant), kwargs = {})
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %constant_1 : [num_users=1] = call_method[target=constant](args = (%ops, 1e-12, torch.float32), kwargs = {})
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %add : [num_users=1] = call_method[target=add](args = (%ops, %truediv, %constant_1), kwargs = {})
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %rsqrt : [num_users=1] = call_method[target=rsqrt](args = (%ops, %add), kwargs = {})
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %mul : [num_users=1] = call_method[target=mul](args = (%ops, %sub, %rsqrt), kwargs = {})
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %get_index_3 : [num_users=1] = call_module[target=get_index](args = (index2,), kwargs = {})
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %load_3 : [num_users=1] = call_method[target=load](args = (%ops, _frozen_param3, %get_index_3), kwargs = {})
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %mul_1 : [num_users=1] = call_method[target=mul](args = (%ops, %mul, %load_3), kwargs = {})
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %get_index_4 : [num_users=1] = call_module[target=get_index](args = (index2,), kwargs = {})
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %load_4 : [num_users=1] = call_method[target=load](args = (%ops, _frozen_param4, %get_index_4), kwargs = {})
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %add_1 : [num_users=1] = call_method[target=add](args = (%ops, %mul_1, %load_4), kwargs = {})
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %get_index_5 : [num_users=1] = call_module[target=get_index](args = (index0,), kwargs = {})
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %store : [num_users=1] = call_method[target=store](args = (%ops, buf4, %get_index_5, %add_1, None), kwargs = {})
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     return store
V0614 00:48:34.118000 140616046391680 torch/_inductor/scheduler.py:2688] [5/0_1] Generating code for node buf5 with estimated runtime 0.000000
V0614 00:48:34.118000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] get_bounds:
V0614 00:48:34.118000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] graph():
V0614 00:48:34.118000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %ops : [num_users=2] = placeholder[target=ops]
V0614 00:48:34.118000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %get_index : [num_users=1] = call_module[target=get_index](args = (index0,), kwargs = {})
V0614 00:48:34.118000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %load : [num_users=1] = call_method[target=load](args = (%ops, arg7_1, %get_index), kwargs = {})
V0614 00:48:34.118000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %get_index_1 : [num_users=1] = call_module[target=get_index](args = (index1,), kwargs = {})
V0614 00:48:34.118000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %store : [num_users=1] = call_method[target=store](args = (%ops, buf5, %get_index_1, %load, None), kwargs = {})
V0614 00:48:34.118000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     return store
V0614 00:48:34.147000 140616046391680 torch/_inductor/scheduler.py:2688] [5/0_1] Generating code for node buf6 with estimated runtime 0.000000
V0614 00:48:34.147000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] get_bounds:
V0614 00:48:34.147000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] graph():
V0614 00:48:34.147000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %ops : [num_users=2] = placeholder[target=ops]
V0614 00:48:34.147000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %get_index : [num_users=1] = call_module[target=get_index](args = (index0,), kwargs = {})
V0614 00:48:34.147000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %load : [num_users=1] = call_method[target=load](args = (%ops, arg7_1, %get_index), kwargs = {})
V0614 00:48:34.147000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %get_index_1 : [num_users=1] = call_module[target=get_index](args = (index1,), kwargs = {})
V0614 00:48:34.147000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %store : [num_users=1] = call_method[target=store](args = (%ops, buf6, %get_index_1, %load, None), kwargs = {})
V0614 00:48:34.147000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     return store
V0614 00:48:34.164000 140616046391680 torch/_inductor/scheduler.py:2688] [5/0_1] Generating code for node buf7 with estimated runtime 0.000000
V0614 00:48:34.164000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] get_bounds:
V0614 00:48:34.164000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] graph():
V0614 00:48:34.164000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %ops : [num_users=2] = placeholder[target=ops]
V0614 00:48:34.164000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %get_index : [num_users=1] = call_module[target=get_index](args = (index0,), kwargs = {})
V0614 00:48:34.164000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %load : [num_users=1] = call_method[target=load](args = (%ops, arg7_1, %get_index), kwargs = {})
V0614 00:48:34.164000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %get_index_1 : [num_users=1] = call_module[target=get_index](args = (index1,), kwargs = {})
V0614 00:48:34.164000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %store : [num_users=1] = call_method[target=store](args = (%ops, buf7, %get_index_1, %load, None), kwargs = {})
V0614 00:48:34.164000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     return store
V0614 00:48:34.180000 140616046391680 torch/_inductor/scheduler.py:2688] [5/0_1] Generating code for node buf8 with estimated runtime 0.000000
V0614 00:48:34.180000 140616046391680 torch/_inductor/scheduler.py:2688] [5/0_1] Generating code for node buf9 with estimated runtime 0.000000
V0614 00:48:34.180000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] get_bounds:
V0614 00:48:34.180000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] graph():
V0614 00:48:34.180000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %ops : [num_users=4] = placeholder[target=ops]
V0614 00:48:34.180000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %get_index : [num_users=1] = call_module[target=get_index](args = (index0,), kwargs = {})
V0614 00:48:34.180000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %load : [num_users=1] = call_method[target=load](args = (%ops, arg7_1, %get_index), kwargs = {})
V0614 00:48:34.180000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %get_index_1 : [num_users=1] = call_module[target=get_index](args = (index1,), kwargs = {})
V0614 00:48:34.180000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %load_1 : [num_users=1] = call_method[target=load](args = (%ops, buf8, %get_index_1), kwargs = {})
V0614 00:48:34.180000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %mul : [num_users=1] = call_method[target=mul](args = (%ops, %load, %load_1), kwargs = {})
V0614 00:48:34.180000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %get_index_2 : [num_users=1] = call_module[target=get_index](args = (index2,), kwargs = {})
V0614 00:48:34.180000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     %store : [num_users=1] = call_method[target=store](args = (%ops, buf9, %get_index_2, %mul, None), kwargs = {})
V0614 00:48:34.180000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1]     return store
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] Output code:
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] # AOT ID: ['3_inference']
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] from ctypes import c_void_p, c_long
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] import torch
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] import math
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] import random
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] import os
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] import tempfile
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] from math import inf, nan
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] from torch._inductor.hooks import run_intermediate_hooks
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] from torch._inductor.utils import maybe_profile
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] from torch._inductor.codegen.memory_planning import _align as align
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] from torch import device, empty_strided
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] from torch._inductor.async_compile import AsyncCompile
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] from torch._inductor.select_algorithm import extern_kernels
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] from torch._inductor.codegen.multi_kernel import MultiKernelCall
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] aten = torch.ops.aten
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] inductor_ops = torch.ops.inductor
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] _quantized = torch.ops._quantized
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] assert_size_stride = torch._C._dynamo.guards.assert_size_stride
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] alloc_from_pool = torch.ops.inductor._alloc_from_pool
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] async_compile = AsyncCompile()
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] _frozen_param0 = None  # device(type='cpu') torch.float32 (50358, 768) (768, 1) 7fe22c324c70
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] _frozen_param1 = None  # device(type='cpu') torch.float32 (2, 768) (768, 1) 7fe22c324bd0
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] _frozen_param3 = None  # device(type='cpu') torch.float32 (768,) (1,) 7fe22c324b30
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] _frozen_param4 = None  # device(type='cpu') torch.float32 (768,) (1,) 7fe22c324b80
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] _frozen_param6 = None  # device(type='cpu') torch.float32 (1, 832, 768) (638976, 768, 1) 7fe1827beca0
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] cpp_fused_add_cat_embedding_mul_native_layer_norm_0 = async_compile.cpp_pybinding(['const int64_t*', 'const float*', 'const int64_t*', 'const float*', 'const float*', 'const float*', 'const float*', 'const float*', 'const float*', 'float*', 'float*', 'float*', 'float*', 'float*', 'float*', 'float*', 'float*'], '''
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] #include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] extern "C" void kernel(const int64_t* in_ptr0,
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                        const float* in_ptr1,
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                        const int64_t* in_ptr2,
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                        const float* in_ptr3,
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                        const float* in_ptr4,
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                        const float* in_ptr5,
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                        const float* in_ptr6,
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                        const float* in_ptr7,
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                        const float* in_ptr8,
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                        float* out_ptr0,
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                        float* out_ptr1,
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                        float* out_ptr2,
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                        float* out_ptr3,
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                        float* out_ptr4,
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                        float* out_ptr5,
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                        float* out_ptr6,
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                        float* out_ptr7)
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]     #pragma omp parallel num_threads(56)
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]     {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]         int tid = omp_get_thread_num();
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]         {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]             #pragma omp for
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]             for(long x0=static_cast<long>(0L); x0<static_cast<long>(832L); x0+=static_cast<long>(1L))
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]             {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                 {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                     Welford<float> tmp_acc0 = Welford<float>();
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                     Welford<at::vec::Vectorized<float>> tmp_acc0_vec = Welford<at::vec::Vectorized<float>>();
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                     static WeightRecp<at::vec::Vectorized<float>> weight_recps(static_cast<long>(48L));
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                     for(long x1=static_cast<long>(0L); x1<static_cast<long>(768L); x1+=static_cast<long>(16L))
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                     {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                         auto tmp0 = in_ptr0[static_cast<long>(x0)];
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                         auto tmp10 = in_ptr2[static_cast<long>(x0)];
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                         auto tmp21 = at::vec::Vectorized<float>::loadu(in_ptr4 + static_cast<long>(x1 + (768L*x0)), 16);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                         auto tmp1 = 50358L;
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                         auto tmp2 = c10::convert<int64_t>(tmp1);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                         auto tmp3 = decltype(tmp0)(tmp0 + tmp2);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                         auto tmp4 = tmp0 < 0;
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                         auto tmp5 = tmp4 ? tmp3 : tmp0;
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                         auto tmp6 = tmp5;
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                         auto tmp7 = c10::convert<int64_t>(tmp6);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                         TORCH_CHECK((0 <= tmp7) & (tmp7 < 50358L), "index out of bounds: 0 <= tmp7 < 50358L");
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                         auto tmp9 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1 + (768L*tmp5)), 16);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                         auto tmp11 = 2L;
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                         auto tmp12 = c10::convert<int64_t>(tmp11);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                         auto tmp13 = decltype(tmp10)(tmp10 + tmp12);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                         auto tmp14 = tmp10 < 0;
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                         auto tmp15 = tmp14 ? tmp13 : tmp10;
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                         auto tmp16 = tmp15;
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                         auto tmp17 = c10::convert<int64_t>(tmp16);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                         TORCH_CHECK((0 <= tmp17) & (tmp17 < 2L), "index out of bounds: 0 <= tmp17 < 2L");
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                         auto tmp19 = at::vec::Vectorized<float>::loadu(in_ptr3 + static_cast<long>(x1 + (768L*tmp15)), 16);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                         auto tmp20 = tmp9 + tmp19;
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                         auto tmp22 = tmp20 + tmp21;
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                         tmp22.store(out_ptr0 + static_cast<long>(x1 + (768L*x0)));
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                         tmp_acc0_vec = welford_combine(tmp_acc0_vec, tmp22, &weight_recps);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                     }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                     tmp_acc0 = welford_combine(tmp_acc0, welford_vec_reduce_all(tmp_acc0_vec));
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                     out_ptr1[static_cast<long>(x0)] = static_cast<float>(tmp_acc0.mean);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                     out_ptr2[static_cast<long>(x0)] = static_cast<float>(tmp_acc0.m2);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                 }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                 for(long x1=static_cast<long>(0L); x1<static_cast<long>(768L); x1+=static_cast<long>(16L))
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                 {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                     auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr0 + static_cast<long>(x1 + (768L*x0)), 16);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                     auto tmp1 = out_ptr1[static_cast<long>(x0)];
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                     auto tmp4 = out_ptr2[static_cast<long>(x0)];
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                     auto tmp12 = at::vec::Vectorized<float>::loadu(in_ptr5 + static_cast<long>(x1), 16);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                     auto tmp14 = at::vec::Vectorized<float>::loadu(in_ptr6 + static_cast<long>(x1), 16);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                     auto tmp2 = at::vec::Vectorized<float>(tmp1);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                     auto tmp3 = tmp0 - tmp2;
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                     auto tmp5 = static_cast<float>(768.0);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                     auto tmp6 = tmp4 / tmp5;
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                     auto tmp7 = static_cast<float>(1e-12);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                     auto tmp8 = decltype(tmp6)(tmp6 + tmp7);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                     auto tmp9 = 1 / std::sqrt(tmp8);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                     auto tmp10 = at::vec::Vectorized<float>(tmp9);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                     auto tmp11 = tmp3 * tmp10;
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                     auto tmp13 = tmp11 * tmp12;
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                     auto tmp15 = tmp13 + tmp14;
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                     tmp15.store(out_ptr3 + static_cast<long>(x1 + (768L*x0)));
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                 }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]             }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]         }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]         #pragma omp single
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]         {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]             {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                 #pragma GCC ivdep
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                 for(long x0=static_cast<long>(0L); x0<static_cast<long>(9L); x0+=static_cast<long>(1L))
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                 {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                     for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(16L))
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                     {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                         auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr7 + static_cast<long>(64L + x1 + (64L*x0)), 16);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                         tmp0.store(out_ptr4 + static_cast<long>(x1 + (192L*x0)));
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                     }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                 }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]             }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]         }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]         #pragma omp single
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]         {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]             {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                 #pragma GCC ivdep
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                 for(long x0=static_cast<long>(0L); x0<static_cast<long>(9L); x0+=static_cast<long>(1L))
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                 {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                     for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(16L))
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                     {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                         auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr7 + static_cast<long>(128L + x1 + (64L*x0)), 16);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                         tmp0.store(out_ptr5 + static_cast<long>(x1 + (192L*x0)));
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                     }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                 }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]             }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]         }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]         #pragma omp single
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]         {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]             {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                 #pragma GCC ivdep
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                 for(long x0=static_cast<long>(0L); x0<static_cast<long>(9L); x0+=static_cast<long>(1L))
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                 {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                     for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(16L))
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                     {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                         auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr7 + static_cast<long>(192L + x1 + (64L*x0)), 16);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                         tmp0.store(out_ptr6 + static_cast<long>(x1 + (192L*x0)));
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                     }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                 }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]             }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]         }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]         #pragma omp single
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]         {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]             {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                 #pragma GCC ivdep
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                 for(long x0=static_cast<long>(0L); x0<static_cast<long>(9L); x0+=static_cast<long>(1L))
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                 {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                     #pragma GCC ivdep
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                     for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                     {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                         for(long x2=static_cast<long>(0L); x2<static_cast<long>(192L); x2+=static_cast<long>(16L))
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                         {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                             auto tmp0 = in_ptr7[static_cast<long>(128L + x1 + (64L*x0))];
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                             auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr8 + static_cast<long>(x2 + (192L*x0)), 16);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                             auto tmp2 = at::vec::Vectorized<float>(tmp0);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                             auto tmp3 = tmp2 * tmp1;
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                             tmp3.store(out_ptr7 + static_cast<long>(x2 + (192L*x1) + (12288L*x0)));
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                         }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                     }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]                 }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]             }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]         }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]     }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] ''')
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] async_compile.wait(globals())
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] del async_compile
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] def call(args):
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]     arg6_1, arg7_1, arg8_1 = args
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]     args.clear()
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]     assert_size_stride(arg6_1, (1, 832), (832, 1))
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]     assert_size_stride(arg7_1, (1, 832), (832, 1))
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]     assert_size_stride(arg8_1, (1, 832), (832, 1))
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]     buf0 = empty_strided_cpu((1, 832, 768), (638976, 768, 1), torch.float32)
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]     buf1 = empty_strided_cpu((1, 832, 1), (832, 1, 832), torch.float32)
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]     buf2 = empty_strided_cpu((1, 832, 1), (832, 1, 832), torch.float32)
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]     buf4 = empty_strided_cpu((1, 832, 768), (638976, 768, 1), torch.float32)
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]     buf8 = empty_strided_cpu((1, 9, 192), (1728, 192, 1), torch.float32)
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]     buf5 = reinterpret_tensor(buf8, (1, 9, 64), (1728, 192, 1), 0)  # alias
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]     buf6 = reinterpret_tensor(buf8, (1, 9, 64), (1728, 192, 1), 64)  # alias
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]     buf7 = reinterpret_tensor(buf8, (1, 9, 64), (1728, 192, 1), 128)  # alias
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]     buf9 = empty_strided_cpu((1, 9, 64, 192), (110592, 12288, 192, 1), torch.float32)
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]     cpp_fused_add_cat_embedding_mul_native_layer_norm_0(arg6_1, _frozen_param0, arg8_1, _frozen_param1, _frozen_param6, _frozen_param3, _frozen_param4, arg7_1, buf8, buf0, buf1, buf2, buf4, buf5, buf6, buf7, buf9)
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]     del arg6_1
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]     del arg8_1
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]     return (buf4, reinterpret_tensor(buf9, (1, 1, 9, 64, 192), (110592, 110592, 12288, 192, 1), 0), reinterpret_tensor(arg7_1, (1, 1, 832, 1), (832, 832, 1, 1), 0), reinterpret_tensor(arg7_1, (1, 1, 1, 832), (832, 832, 832, 1), 0), reinterpret_tensor(arg7_1, (1, 13, 64), (832, 64, 1), 0), )
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] def benchmark_compiled_module(times=10, repeat=10):
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]     from torch._dynamo.testing import rand_strided
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]     from torch._inductor.utils import print_performance
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]     global _frozen_param0
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]     _frozen_param0 = rand_strided((50358, 768), (768, 1), device='cpu', dtype=torch.float32)
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]     global _frozen_param1
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]     _frozen_param1 = rand_strided((2, 768), (768, 1), device='cpu', dtype=torch.float32)
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]     global _frozen_param3
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]     _frozen_param3 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.float32)
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]     global _frozen_param4
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]     _frozen_param4 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.float32)
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]     global _frozen_param6
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]     _frozen_param6 = rand_strided((1, 832, 768), (638976, 768, 1), device='cpu', dtype=torch.float32)
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]     arg6_1 = rand_strided((1, 832), (832, 1), device='cpu', dtype=torch.int64)
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]     arg7_1 = rand_strided((1, 832), (832, 1), device='cpu', dtype=torch.float32)
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]     arg8_1 = rand_strided((1, 832), (832, 1), device='cpu', dtype=torch.int64)
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]     fn = lambda: call([arg6_1, arg7_1, arg8_1])
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]     return print_performance(fn, times=times, repeat=repeat)
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] if __name__ == "__main__":
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]     from torch._inductor.wrapper_benchmark import compiled_module_main
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]     compiled_module_main('hf_BigBird', benchmark_compiled_module)
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]
V0614 00:48:35.359000 140616046391680 torch/_inductor/graph.py:1714] [5/0_1] Output code written to: /tmp/torchinductor_leslie/qn/cqnbrpdzwchbnkizgxa644pd2tr2v3yro2b26pyq2apqsqovh6ej.py
I0614 00:48:35.359000 140616046391680 torch/_inductor/graph.py:1715] [5/0_1] [__output_code] Output code written to: /tmp/torchinductor_leslie/qn/cqnbrpdzwchbnkizgxa644pd2tr2v3yro2b26pyq2apqsqovh6ej.py
V0614 00:48:35.360000 140616046391680 torch/_inductor/compile_fx.py:531] [5/0_1] FX codegen and compilation took 1.522s
I0614 00:48:35.360000 140616046391680 torch/_dynamo/logging.py:56] [5/0_1] Step 3: torchinductor done compiling FORWARDS graph 3
V0614 00:48:35.553000 140616046391680 torch/_inductor/freezing.py:118] [8/0_1] TRACED GRAPH
V0614 00:48:35.553000 140616046391680 torch/_inductor/freezing.py:118] [8/0_1]  ===== FROZEN GRAPH =====
V0614 00:48:35.553000 140616046391680 torch/_inductor/freezing.py:118] [8/0_1]  /localdisk/leslie/torch_inductor_community/pytorch/torch/fx/_lazy_graph_module.py class <lambda>(torch.nn.Module):
V0614 00:48:35.553000 140616046391680 torch/_inductor/freezing.py:118] [8/0_1]     def forward(self, arg0_1: "f32[1, 1, 9, 64, 192]", arg1_1: "f32[1, 1, 832, 1]", arg2_1: "f32[1, 1, 1, 832]"):
V0614 00:48:35.553000 140616046391680 torch/_inductor/freezing.py:118] [8/0_1]         return (arg0_1, arg1_1, arg2_1)
V0614 00:48:35.553000 140616046391680 torch/_inductor/freezing.py:118] [8/0_1]
V0614 00:48:35.553000 140616046391680 torch/_inductor/freezing.py:118] [8/0_1]
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1] TRACED GRAPH
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]  ===== FROZEN GRAPH =====
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]  /localdisk/leslie/torch_inductor_community/pytorch/torch/fx/_lazy_graph_module.py class <lambda>(torch.nn.Module):
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]     def forward(self, arg6_1: "f32[1, 832, 768]"):
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:468 in forward, code: query_layer = self.transpose_for_scores(self.query(hidden_states))
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]         _frozen_param6: "bf16[768]" = self._frozen_param6
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]         convert_element_type_2: "bf16[1, 832, 768]" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16);  arg6_1 = None
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]         # No stacktrace found for following nodes
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]         _frozen_param12 = self._frozen_param12
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:468 in forward, code: query_layer = self.transpose_for_scores(self.query(hidden_states))
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]         _linear_pointwise_default_5: "bf16[1, 832, 768]" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param12, _frozen_param6, 'none', [], '');  _frozen_param12 = _frozen_param6 = None
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]         view_2: "bf16[1, 832, 12, 64]" = torch.ops.aten.reshape.default(_linear_pointwise_default_5, [1, 832, 12, 64]);  _linear_pointwise_default_5 = None
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]         permute_1: "bf16[1, 12, 832, 64]" = torch.ops.aten.permute.default(view_2, [0, 2, 1, 3]);  view_2 = None
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:469 in forward, code: key_layer = self.transpose_for_scores(self.key(hidden_states))
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]         _frozen_param8: "bf16[768]" = self._frozen_param8
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]         # No stacktrace found for following nodes
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]         _frozen_param13 = self._frozen_param13
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:469 in forward, code: key_layer = self.transpose_for_scores(self.key(hidden_states))
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]         _linear_pointwise_default_4: "bf16[1, 832, 768]" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param13, _frozen_param8, 'none', [], '');  _frozen_param13 = _frozen_param8 = None
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]         view_5: "bf16[1, 832, 12, 64]" = torch.ops.aten.reshape.default(_linear_pointwise_default_4, [1, 832, 12, 64]);  _linear_pointwise_default_4 = None
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]         permute_3: "bf16[1, 12, 832, 64]" = torch.ops.aten.permute.default(view_5, [0, 2, 1, 3]);  view_5 = None
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:470 in forward, code: value_layer = self.transpose_for_scores(self.value(hidden_states))
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]         _frozen_param10: "bf16[768]" = self._frozen_param10
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]         # No stacktrace found for following nodes
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]         _frozen_param14 = self._frozen_param14
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:470 in forward, code: value_layer = self.transpose_for_scores(self.value(hidden_states))
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]         _linear_pointwise_default_3: "bf16[1, 832, 768]" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param14, _frozen_param10, 'none', [], '');  convert_element_type_2 = _frozen_param14 = _frozen_param10 = None
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]         view_8: "bf16[1, 832, 12, 64]" = torch.ops.aten.reshape.default(_linear_pointwise_default_3, [1, 832, 12, 64]);  _linear_pointwise_default_3 = None
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]         permute_5: "bf16[1, 12, 832, 64]" = torch.ops.aten.permute.default(view_8, [0, 2, 1, 3]);  view_8 = None
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]         return (permute_1, permute_3, permute_5)
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]
I0614 00:48:35.701000 140616046391680 torch/_dynamo/logging.py:56] [9/0_1] Step 3: torchinductor compiling FORWARDS graph 5
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs] TRACED GRAPH
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]  ===== AFTER POST GRAD =====
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]  /localdisk/leslie/torch_inductor_community/pytorch/torch/fx/_lazy_graph_module.py class <lambda>(torch.nn.Module):
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]     def forward(self, arg6_1: "f32[1, 832, 768][638976, 768, 1]cpu"):
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:468 in forward, code: query_layer = self.transpose_for_scores(self.query(hidden_states))
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]         _frozen_param6: "bf16[768][1]cpu" = self._frozen_param6
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]         # No stacktrace found for following nodes
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]         _frozen_param12: "bf16[768, 768][1, 0]cpu" = self._frozen_param12
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:469 in forward, code: key_layer = self.transpose_for_scores(self.key(hidden_states))
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]         _frozen_param8: "bf16[768][1]cpu" = self._frozen_param8
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]         # No stacktrace found for following nodes
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]         _frozen_param13: "bf16[768, 768][1, 0]cpu" = self._frozen_param13
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:470 in forward, code: value_layer = self.transpose_for_scores(self.value(hidden_states))
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]         _frozen_param10: "bf16[768][1]cpu" = self._frozen_param10
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]         # No stacktrace found for following nodes
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]         _frozen_param14: "bf16[768, 768][1, 0]cpu" = self._frozen_param14
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:468 in forward, code: query_layer = self.transpose_for_scores(self.query(hidden_states))
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]         convert_element_type_2: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16);  arg6_1 = None
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]         _linear_pointwise_default_5: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param12, _frozen_param6, 'none', [], '');  _frozen_param12 = _frozen_param6 = None
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]         view_2: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.reshape.default(_linear_pointwise_default_5, [1, 832, 12, 64]);  _linear_pointwise_default_5 = None
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]         permute_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_2, [0, 2, 1, 3]);  view_2 = None
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:469 in forward, code: key_layer = self.transpose_for_scores(self.key(hidden_states))
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]         _linear_pointwise_default_4: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param13, _frozen_param8, 'none', [], '');  _frozen_param13 = _frozen_param8 = None
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]         view_5: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.reshape.default(_linear_pointwise_default_4, [1, 832, 12, 64]);  _linear_pointwise_default_4 = None
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]         permute_3: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_5, [0, 2, 1, 3]);  view_5 = None
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:470 in forward, code: value_layer = self.transpose_for_scores(self.value(hidden_states))
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]         _linear_pointwise_default_3: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param14, _frozen_param10, 'none', [], '');  convert_element_type_2 = _frozen_param14 = _frozen_param10 = None
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]         view_8: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.reshape.default(_linear_pointwise_default_3, [1, 832, 12, 64]);  _linear_pointwise_default_3 = None
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]         permute_5: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_8, [0, 2, 1, 3]);  view_8 = None
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]         return (permute_1, permute_3, permute_5)
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]
V0614 00:48:35.713000 140616046391680 torch/_inductor/graph.py:1173] [9/0_1] lowering %arg6_1 : [num_users=1] = placeholder[target=arg6_1]
V0614 00:48:35.714000 140616046391680 torch/_inductor/graph.py:1173] [9/0_1] lowering %_frozen_param6 : [num_users=1] = get_attr[target=_frozen_param6]
V0614 00:48:35.714000 140616046391680 torch/_inductor/graph.py:1173] [9/0_1] lowering %_frozen_param12 : [num_users=1] = get_attr[target=_frozen_param12]
V0614 00:48:35.715000 140616046391680 torch/_inductor/graph.py:1173] [9/0_1] lowering %_frozen_param8 : [num_users=1] = get_attr[target=_frozen_param8]
V0614 00:48:35.715000 140616046391680 torch/_inductor/graph.py:1173] [9/0_1] lowering %_frozen_param13 : [num_users=1] = get_attr[target=_frozen_param13]
V0614 00:48:35.715000 140616046391680 torch/_inductor/graph.py:1173] [9/0_1] lowering %_frozen_param10 : [num_users=1] = get_attr[target=_frozen_param10]
V0614 00:48:35.716000 140616046391680 torch/_inductor/graph.py:1173] [9/0_1] lowering %_frozen_param14 : [num_users=1] = get_attr[target=_frozen_param14]
V0614 00:48:35.716000 140616046391680 torch/_inductor/graph.py:1173] [9/0_1] lowering %convert_element_type_2 : [num_users=3] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%arg6_1, torch.bfloat16), kwargs = {})
V0614 00:48:35.716000 140616046391680 torch/_inductor/graph.py:976] [9/0_1]   via <function _convert_element_type at 0x7fe2179c13f0>
V0614 00:48:35.718000 140616046391680 torch/_inductor/graph.py:1173] [9/0_1] lowering %_linear_pointwise_default_5 : [num_users=1] = call_function[target=torch.ops.mkldnn._linear_pointwise.default](args = (%convert_element_type_2, %_frozen_param12, %_frozen_param6, none, [], ), kwargs = {})
V0614 00:48:35.718000 140616046391680 torch/_inductor/graph.py:976] [9/0_1]   via <function register_onednn_fusion_ops.<locals>.linear_unary at 0x7fe18485d3f0>
V0614 00:48:35.718000 140616046391680 torch/_inductor/graph.py:1173] [9/0_1] lowering %view_2 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%_linear_pointwise_default_5, [1, 832, 12, 64]), kwargs = {})
V0614 00:48:35.718000 140616046391680 torch/_inductor/graph.py:976] [9/0_1]   via <function view at 0x7fe2179c3130>
V0614 00:48:35.719000 140616046391680 torch/_inductor/graph.py:1173] [9/0_1] lowering %permute_1 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%view_2, [0, 2, 1, 3]), kwargs = {})
V0614 00:48:35.719000 140616046391680 torch/_inductor/graph.py:976] [9/0_1]   via <function permute at 0x7fe2179c3370>
V0614 00:48:35.720000 140616046391680 torch/_inductor/graph.py:1173] [9/0_1] lowering %_linear_pointwise_default_4 : [num_users=1] = call_function[target=torch.ops.mkldnn._linear_pointwise.default](args = (%convert_element_type_2, %_frozen_param13, %_frozen_param8, none, [], ), kwargs = {})
V0614 00:48:35.721000 140616046391680 torch/_inductor/graph.py:976] [9/0_1]   via <function register_onednn_fusion_ops.<locals>.linear_unary at 0x7fe18485d3f0>
V0614 00:48:35.721000 140616046391680 torch/_inductor/graph.py:1173] [9/0_1] lowering %view_5 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%_linear_pointwise_default_4, [1, 832, 12, 64]), kwargs = {})
V0614 00:48:35.721000 140616046391680 torch/_inductor/graph.py:976] [9/0_1]   via <function view at 0x7fe2179c3130>
V0614 00:48:35.722000 140616046391680 torch/_inductor/graph.py:1173] [9/0_1] lowering %permute_3 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%view_5, [0, 2, 1, 3]), kwargs = {})
V0614 00:48:35.722000 140616046391680 torch/_inductor/graph.py:976] [9/0_1]   via <function permute at 0x7fe2179c3370>
V0614 00:48:35.723000 140616046391680 torch/_inductor/graph.py:1173] [9/0_1] lowering %_linear_pointwise_default_3 : [num_users=1] = call_function[target=torch.ops.mkldnn._linear_pointwise.default](args = (%convert_element_type_2, %_frozen_param14, %_frozen_param10, none, [], ), kwargs = {})
V0614 00:48:35.723000 140616046391680 torch/_inductor/graph.py:976] [9/0_1]   via <function register_onednn_fusion_ops.<locals>.linear_unary at 0x7fe18485d3f0>
V0614 00:48:35.723000 140616046391680 torch/_inductor/graph.py:1173] [9/0_1] lowering %view_8 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%_linear_pointwise_default_3, [1, 832, 12, 64]), kwargs = {})
V0614 00:48:35.723000 140616046391680 torch/_inductor/graph.py:976] [9/0_1]   via <function view at 0x7fe2179c3130>
V0614 00:48:35.724000 140616046391680 torch/_inductor/graph.py:1173] [9/0_1] lowering %permute_5 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%view_8, [0, 2, 1, 3]), kwargs = {})
V0614 00:48:35.724000 140616046391680 torch/_inductor/graph.py:976] [9/0_1]   via <function permute at 0x7fe2179c3370>
V0614 00:48:35.725000 140616046391680 torch/_inductor/graph.py:1173] [9/0_1] lowering return (permute_1, permute_3, permute_5)
V0614 00:48:35.725000 140616046391680 torch/_inductor/graph.py:1097] [9/0_1] Force channels last inputs for 0 conv for the current graph with id 5
V0614 00:48:35.731000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] scheduling ComputedBuffer(name='buf0', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 832, 768], stride=[638976, 768, 1]), data=Pointwise(
V0614 00:48:35.731000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   'cpu',
V0614 00:48:35.731000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   torch.bfloat16,
V0614 00:48:35.731000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   def inner_fn(index):
V0614 00:48:35.731000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]       _, i1, i2 = index
V0614 00:48:35.731000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]       tmp0 = ops.load(arg6_1, i2 + 768 * i1)
V0614 00:48:35.731000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]       tmp1 = ops.to_dtype(tmp0, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:35.731000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]       return tmp1
V0614 00:48:35.731000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   ,
V0614 00:48:35.731000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   ranges=[1, 832, 768],
V0614 00:48:35.731000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   origin_node=convert_element_type_2,
V0614 00:48:35.731000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   origins={convert_element_type_2}
V0614 00:48:35.731000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] ))
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] scheduling LinearUnary(
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   python_kernel_name='torch.ops.mkldnn._linear_pointwise',
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   name=buf1,
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   layout=FixedLayout('cpu', torch.bfloat16, size=[1, 832, 768], stride=[638976, 768, 1]),
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   inputs=[ComputedBuffer(name='buf0', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 832, 768], stride=[638976, 768, 1]), data=Pointwise(
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]     'cpu',
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]     torch.bfloat16,
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]     def inner_fn(index):
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]         _, i1, i2 = index
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]         tmp0 = ops.load(arg6_1, i2 + 768 * i1)
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]         tmp1 = ops.to_dtype(tmp0, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]         return tmp1
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]     ,
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]     ranges=[1, 832, 768],
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]     origin_node=convert_element_type_2,
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]     origins={convert_element_type_2}
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   )), ConstantBuffer(name='_frozen_param12', layout=FixedLayout('cpu', torch.bfloat16, size=[768, 768], stride=[1, 0])), ConstantBuffer(name='_frozen_param6', layout=FixedLayout('cpu', torch.bfloat16, size=[768], stride=[1]))],
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   constant_args=['none', [-1], ''],
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   kwargs={},
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   output_view=None,
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   python_kernel_name=torch.ops.mkldnn._linear_pointwise,
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   cpp_kernel_name=mkldnn::_linear_pointwise,
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   ordered_kwargs_for_cpp_kernel=(),
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   op_overload=None,
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   arg_properties=[{}, {}, {}],
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   kwarg_properties=None,
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   unbacked_bindings={},
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   origin_node=_linear_pointwise_default_5,
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   origins={_linear_pointwise_default_5}
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] )
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] scheduling LinearUnary(
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   python_kernel_name='torch.ops.mkldnn._linear_pointwise',
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   name=buf2,
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   layout=FixedLayout('cpu', torch.bfloat16, size=[1, 832, 768], stride=[638976, 768, 1]),
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   inputs=[ComputedBuffer(name='buf0', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 832, 768], stride=[638976, 768, 1]), data=Pointwise(
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]     'cpu',
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]     torch.bfloat16,
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]     def inner_fn(index):
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]         _, i1, i2 = index
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]         tmp0 = ops.load(arg6_1, i2 + 768 * i1)
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]         tmp1 = ops.to_dtype(tmp0, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]         return tmp1
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]     ,
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]     ranges=[1, 832, 768],
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]     origin_node=convert_element_type_2,
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]     origins={convert_element_type_2}
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   )), ConstantBuffer(name='_frozen_param13', layout=FixedLayout('cpu', torch.bfloat16, size=[768, 768], stride=[1, 0])), ConstantBuffer(name='_frozen_param8', layout=FixedLayout('cpu', torch.bfloat16, size=[768], stride=[1]))],
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   constant_args=['none', [-1], ''],
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   kwargs={},
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   output_view=None,
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   python_kernel_name=torch.ops.mkldnn._linear_pointwise,
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   cpp_kernel_name=mkldnn::_linear_pointwise,
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   ordered_kwargs_for_cpp_kernel=(),
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   op_overload=None,
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   arg_properties=[{}, {}, {}],
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   kwarg_properties=None,
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   unbacked_bindings={},
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   origin_node=_linear_pointwise_default_4,
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   origins={_linear_pointwise_default_4}
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] )
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] scheduling LinearUnary(
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   python_kernel_name='torch.ops.mkldnn._linear_pointwise',
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   name=buf3,
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   layout=FixedLayout('cpu', torch.bfloat16, size=[1, 832, 768], stride=[638976, 768, 1]),
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   inputs=[ComputedBuffer(name='buf0', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 832, 768], stride=[638976, 768, 1]), data=Pointwise(
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]     'cpu',
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]     torch.bfloat16,
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]     def inner_fn(index):
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]         _, i1, i2 = index
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]         tmp0 = ops.load(arg6_1, i2 + 768 * i1)
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]         tmp1 = ops.to_dtype(tmp0, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]         return tmp1
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]     ,
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]     ranges=[1, 832, 768],
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]     origin_node=convert_element_type_2,
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]     origins={convert_element_type_2}
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   )), ConstantBuffer(name='_frozen_param14', layout=FixedLayout('cpu', torch.bfloat16, size=[768, 768], stride=[1, 0])), ConstantBuffer(name='_frozen_param10', layout=FixedLayout('cpu', torch.bfloat16, size=[768], stride=[1]))],
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   constant_args=['none', [-1], ''],
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   kwargs={},
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   output_view=None,
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   python_kernel_name=torch.ops.mkldnn._linear_pointwise,
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   cpp_kernel_name=mkldnn::_linear_pointwise,
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   ordered_kwargs_for_cpp_kernel=(),
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   op_overload=None,
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   arg_properties=[{}, {}, {}],
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   kwarg_properties=None,
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   unbacked_bindings={},
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   origin_node=_linear_pointwise_default_3,
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1]   origins={_linear_pointwise_default_3}
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] )
V0614 00:48:35.734000 140616046391680 torch/_inductor/scheduler.py:1671] [9/0_1] scheduling output buf1
V0614 00:48:35.734000 140616046391680 torch/_inductor/scheduler.py:1671] [9/0_1] scheduling output buf2
V0614 00:48:35.734000 140616046391680 torch/_inductor/scheduler.py:1671] [9/0_1] scheduling output buf3
V0614 00:48:35.734000 140616046391680 torch/_inductor/scheduler.py:2688] [9/0_1] Generating code for node buf0 with estimated runtime 0.000000
V0614 00:48:35.735000 140616046391680 torch/_inductor/bounds.py:63] [9/0_1] get_bounds:
V0614 00:48:35.735000 140616046391680 torch/_inductor/bounds.py:63] [9/0_1] graph():
V0614 00:48:35.735000 140616046391680 torch/_inductor/bounds.py:63] [9/0_1]     %ops : [num_users=3] = placeholder[target=ops]
V0614 00:48:35.735000 140616046391680 torch/_inductor/bounds.py:63] [9/0_1]     %get_index : [num_users=1] = call_module[target=get_index](args = (index0,), kwargs = {})
V0614 00:48:35.735000 140616046391680 torch/_inductor/bounds.py:63] [9/0_1]     %load : [num_users=1] = call_method[target=load](args = (%ops, arg6_1, %get_index), kwargs = {})
V0614 00:48:35.735000 140616046391680 torch/_inductor/bounds.py:63] [9/0_1]     %get_index_1 : [num_users=1] = call_module[target=get_index](args = (index0,), kwargs = {})
V0614 00:48:35.735000 140616046391680 torch/_inductor/bounds.py:63] [9/0_1]     %to_dtype_1 : [num_users=1] = call_method[target=to_dtype](args = (%ops, %load, torch.bfloat16), kwargs = {})
V0614 00:48:35.735000 140616046391680 torch/_inductor/bounds.py:63] [9/0_1]     %store : [num_users=1] = call_method[target=store](args = (%ops, buf0, %get_index_1, %to_dtype_1, None), kwargs = {})
V0614 00:48:35.735000 140616046391680 torch/_inductor/bounds.py:63] [9/0_1]     return store
V0614 00:48:35.738000 140616046391680 torch/_inductor/scheduler.py:2688] [9/0_1] Generating code for node buf1 with estimated runtime 0.000000
V0614 00:48:35.739000 140616046391680 torch/_inductor/scheduler.py:2688] [9/0_1] Generating code for node buf2 with estimated runtime 0.000000
V0614 00:48:35.740000 140616046391680 torch/_inductor/scheduler.py:2688] [9/0_1] Generating code for node buf3 with estimated runtime 0.000000
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] Output code:
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] # AOT ID: ['5_inference']
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] from ctypes import c_void_p, c_long
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] import torch
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] import math
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] import random
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] import os
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] import tempfile
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] from math import inf, nan
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] from torch._inductor.hooks import run_intermediate_hooks
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] from torch._inductor.utils import maybe_profile
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] from torch._inductor.codegen.memory_planning import _align as align
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] from torch import device, empty_strided
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] from torch._inductor.async_compile import AsyncCompile
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] from torch._inductor.select_algorithm import extern_kernels
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] from torch._inductor.codegen.multi_kernel import MultiKernelCall
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] aten = torch.ops.aten
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] inductor_ops = torch.ops.inductor
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] _quantized = torch.ops._quantized
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] assert_size_stride = torch._C._dynamo.guards.assert_size_stride
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] alloc_from_pool = torch.ops.inductor._alloc_from_pool
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] async_compile = AsyncCompile()
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] _frozen_param6 = None  # device(type='cpu') torch.bfloat16 (768,) (1,) 7fe181de3b50
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] _frozen_param12 = None  # device(type='cpu') torch.bfloat16 (768, 768) (1, 0) 7fe181dfce00
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] _frozen_param8 = None  # device(type='cpu') torch.bfloat16 (768,) (1,) 7fe181dddcb0
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] _frozen_param13 = None  # device(type='cpu') torch.bfloat16 (768, 768) (1, 0) 7fe181dd1b20
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] _frozen_param10 = None  # device(type='cpu') torch.bfloat16 (768,) (1,) 7fe181de9800
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] _frozen_param14 = None  # device(type='cpu') torch.bfloat16 (768, 768) (1, 0) 7fe181dcbec0
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] cpp_fused__to_copy_0 = async_compile.cpp_pybinding(['const float*', 'bfloat16*'], '''
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] #include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] extern "C" void kernel(const float* in_ptr0,
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]                        bfloat16* out_ptr0)
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] {
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]     #pragma omp parallel num_threads(56)
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]     {
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]         int tid = omp_get_thread_num();
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]         {
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]             #pragma omp for
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]             for(long x0=static_cast<long>(0L); x0<static_cast<long>(638976L); x0+=static_cast<long>(16L))
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]             {
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]                 auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(x0), 16);
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]                 auto tmp1 = at::vec::convert<bfloat16>(tmp0);
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]                 tmp1.store(out_ptr0 + static_cast<long>(x0), 16);
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]             }
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]         }
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]     }
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] }
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] ''')
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] async_compile.wait(globals())
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] del async_compile
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] def call(args):
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]     arg6_1, = args
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]     args.clear()
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]     assert_size_stride(arg6_1, (1, 832, 768), (638976, 768, 1))
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]     buf0 = empty_strided_cpu((1, 832, 768), (638976, 768, 1), torch.bfloat16)
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]     cpp_fused__to_copy_0(arg6_1, buf0)
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]     del arg6_1
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]     buf1 = torch.ops.mkldnn._linear_pointwise(buf0, _frozen_param12, _frozen_param6, 'none', [-1], '')
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]     buf2 = torch.ops.mkldnn._linear_pointwise(buf0, _frozen_param13, _frozen_param8, 'none', [-1], '')
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]     buf3 = torch.ops.mkldnn._linear_pointwise(buf0, _frozen_param14, _frozen_param10, 'none', [-1], '')
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]     return (reinterpret_tensor(buf1, (1, 12, 832, 64), (638976, 64, 768, 1), 0), reinterpret_tensor(buf2, (1, 12, 832, 64), (638976, 64, 768, 1), 0), reinterpret_tensor(buf3, (1, 12, 832, 64), (638976, 64, 768, 1), 0), )
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] def benchmark_compiled_module(times=10, repeat=10):
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]     from torch._dynamo.testing import rand_strided
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]     from torch._inductor.utils import print_performance
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]     global _frozen_param6
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]     _frozen_param6 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.bfloat16)
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]     global _frozen_param12
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]     _frozen_param12 = rand_strided((768, 768), (1, 0), device='cpu', dtype=torch.bfloat16)
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]     global _frozen_param8
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]     _frozen_param8 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.bfloat16)
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]     global _frozen_param13
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]     _frozen_param13 = rand_strided((768, 768), (1, 0), device='cpu', dtype=torch.bfloat16)
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]     global _frozen_param10
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]     _frozen_param10 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.bfloat16)
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]     global _frozen_param14
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]     _frozen_param14 = rand_strided((768, 768), (1, 0), device='cpu', dtype=torch.bfloat16)
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]     arg6_1 = rand_strided((1, 832, 768), (638976, 768, 1), device='cpu', dtype=torch.float32)
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]     fn = lambda: call([arg6_1])
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]     return print_performance(fn, times=times, repeat=repeat)
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] if __name__ == "__main__":
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]     from torch._inductor.wrapper_benchmark import compiled_module_main
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]     compiled_module_main('hf_BigBird', benchmark_compiled_module)
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]
V0614 00:48:36.780000 140616046391680 torch/_inductor/graph.py:1714] [9/0_1] Output code written to: /tmp/torchinductor_leslie/5n/c5nsnj5quh3sljkcq2l2mq7xpgfj4dvyyyyy2kcwvnmub5t6ujeu.py
I0614 00:48:36.781000 140616046391680 torch/_inductor/graph.py:1715] [9/0_1] [__output_code] Output code written to: /tmp/torchinductor_leslie/5n/c5nsnj5quh3sljkcq2l2mq7xpgfj4dvyyyyy2kcwvnmub5t6ujeu.py
V0614 00:48:36.782000 140616046391680 torch/_inductor/compile_fx.py:531] [9/0_1] FX codegen and compilation took 1.082s
I0614 00:48:36.782000 140616046391680 torch/_dynamo/logging.py:56] [9/0_1] Step 3: torchinductor done compiling FORWARDS graph 5
V0614 00:48:36.979000 140616046391680 torch/_inductor/freezing.py:118] [12/0_1] TRACED GRAPH
V0614 00:48:36.979000 140616046391680 torch/_inductor/freezing.py:118] [12/0_1]  ===== FROZEN GRAPH =====
V0614 00:48:36.979000 140616046391680 torch/_inductor/freezing.py:118] [12/0_1]  /localdisk/leslie/torch_inductor_community/pytorch/torch/fx/_lazy_graph_module.py class <lambda>(torch.nn.Module):
V0614 00:48:36.979000 140616046391680 torch/_inductor/freezing.py:118] [12/0_1]     def forward(self):
V0614 00:48:36.979000 140616046391680 torch/_inductor/freezing.py:118] [12/0_1]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1160 in _bigbird_block_rand_mask_with_head, code: plan_block_length = np.array(plan_from_length) // from_block_size
V0614 00:48:36.979000 140616046391680 torch/_inductor/freezing.py:118] [12/0_1]         _frozen_param0: "i64[2]" = self._frozen_param0
V0614 00:48:36.979000 140616046391680 torch/_inductor/freezing.py:118] [12/0_1]         return (_frozen_param0,)
V0614 00:48:36.979000 140616046391680 torch/_inductor/freezing.py:118] [12/0_1]
V0614 00:48:36.979000 140616046391680 torch/_inductor/freezing.py:118] [12/0_1]
V0614 00:48:37.087000 140616046391680 torch/_inductor/freezing.py:118] [14/0] TRACED GRAPH
V0614 00:48:37.087000 140616046391680 torch/_inductor/freezing.py:118] [14/0]  ===== FROZEN GRAPH =====
V0614 00:48:37.087000 140616046391680 torch/_inductor/freezing.py:118] [14/0]  /localdisk/leslie/torch_inductor_community/pytorch/torch/fx/_lazy_graph_module.py class <lambda>(torch.nn.Module):
V0614 00:48:37.087000 140616046391680 torch/_inductor/freezing.py:118] [14/0]     def forward(self, arg0_1: "i32[13, 3]", arg1_1: "i32[13, 3]", arg2_1: "i32[13, 3]", arg3_1: "i32[13, 3]", arg4_1: "i32[13, 3]", arg5_1: "i32[13, 3]", arg6_1: "i32[13, 3]", arg7_1: "i32[13, 3]", arg8_1: "i32[13, 3]", arg9_1: "i32[13, 3]", arg10_1: "i32[13, 3]", arg11_1: "i32[13, 3]"):
V0614 00:48:37.087000 140616046391680 torch/_inductor/freezing.py:118] [14/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1172 in torch_dynamo_resume_in__bigbird_block_rand_mask_with_head_at_1165, code: rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :]
V0614 00:48:37.087000 140616046391680 torch/_inductor/freezing.py:118] [14/0]         slice_1: "i32[11, 3]" = torch.ops.aten.slice.Tensor(arg0_1, 0, 1, 12);  arg0_1 = None
V0614 00:48:37.087000 140616046391680 torch/_inductor/freezing.py:118] [14/0]         slice_3: "i32[11, 3]" = torch.ops.aten.slice.Tensor(arg1_1, 0, 1, 12);  arg1_1 = None
V0614 00:48:37.087000 140616046391680 torch/_inductor/freezing.py:118] [14/0]         slice_5: "i32[11, 3]" = torch.ops.aten.slice.Tensor(arg2_1, 0, 1, 12);  arg2_1 = None
V0614 00:48:37.087000 140616046391680 torch/_inductor/freezing.py:118] [14/0]         slice_7: "i32[11, 3]" = torch.ops.aten.slice.Tensor(arg3_1, 0, 1, 12);  arg3_1 = None
V0614 00:48:37.087000 140616046391680 torch/_inductor/freezing.py:118] [14/0]         slice_9: "i32[11, 3]" = torch.ops.aten.slice.Tensor(arg4_1, 0, 1, 12);  arg4_1 = None
V0614 00:48:37.087000 140616046391680 torch/_inductor/freezing.py:118] [14/0]         slice_11: "i32[11, 3]" = torch.ops.aten.slice.Tensor(arg5_1, 0, 1, 12);  arg5_1 = None
V0614 00:48:37.087000 140616046391680 torch/_inductor/freezing.py:118] [14/0]         slice_13: "i32[11, 3]" = torch.ops.aten.slice.Tensor(arg6_1, 0, 1, 12);  arg6_1 = None
V0614 00:48:37.087000 140616046391680 torch/_inductor/freezing.py:118] [14/0]         slice_15: "i32[11, 3]" = torch.ops.aten.slice.Tensor(arg7_1, 0, 1, 12);  arg7_1 = None
V0614 00:48:37.087000 140616046391680 torch/_inductor/freezing.py:118] [14/0]         slice_17: "i32[11, 3]" = torch.ops.aten.slice.Tensor(arg8_1, 0, 1, 12);  arg8_1 = None
V0614 00:48:37.087000 140616046391680 torch/_inductor/freezing.py:118] [14/0]         slice_19: "i32[11, 3]" = torch.ops.aten.slice.Tensor(arg9_1, 0, 1, 12);  arg9_1 = None
V0614 00:48:37.087000 140616046391680 torch/_inductor/freezing.py:118] [14/0]         slice_21: "i32[11, 3]" = torch.ops.aten.slice.Tensor(arg10_1, 0, 1, 12);  arg10_1 = None
V0614 00:48:37.087000 140616046391680 torch/_inductor/freezing.py:118] [14/0]         slice_23: "i32[11, 3]" = torch.ops.aten.slice.Tensor(arg11_1, 0, 1, 12);  arg11_1 = None
V0614 00:48:37.087000 140616046391680 torch/_inductor/freezing.py:118] [14/0]         return (slice_1, slice_3, slice_5, slice_7, slice_9, slice_11, slice_13, slice_15, slice_17, slice_19, slice_21, slice_23)
V0614 00:48:37.087000 140616046391680 torch/_inductor/freezing.py:118] [14/0]
V0614 00:48:37.087000 140616046391680 torch/_inductor/freezing.py:118] [14/0]
I0614 00:48:37.088000 140616046391680 torch/_dynamo/logging.py:56] [14/0] Step 3: torchinductor compiling FORWARDS graph 7
V0614 00:48:37.095000 140616046391680 torch/_inductor/compile_fx.py:748] [14/0] [__post_grad_graphs] TRACED GRAPH
V0614 00:48:37.095000 140616046391680 torch/_inductor/compile_fx.py:748] [14/0] [__post_grad_graphs]  ===== AFTER POST GRAD =====
V0614 00:48:37.095000 140616046391680 torch/_inductor/compile_fx.py:748] [14/0] [__post_grad_graphs]  /localdisk/leslie/torch_inductor_community/pytorch/torch/fx/_lazy_graph_module.py class <lambda>(torch.nn.Module):
V0614 00:48:37.095000 140616046391680 torch/_inductor/compile_fx.py:748] [14/0] [__post_grad_graphs]     def forward(self, arg0_1: "i32[13, 3][3, 1]cpu", arg1_1: "i32[13, 3][3, 1]cpu", arg2_1: "i32[13, 3][3, 1]cpu", arg3_1: "i32[13, 3][3, 1]cpu", arg4_1: "i32[13, 3][3, 1]cpu", arg5_1: "i32[13, 3][3, 1]cpu", arg6_1: "i32[13, 3][3, 1]cpu", arg7_1: "i32[13, 3][3, 1]cpu", arg8_1: "i32[13, 3][3, 1]cpu", arg9_1: "i32[13, 3][3, 1]cpu", arg10_1: "i32[13, 3][3, 1]cpu", arg11_1: "i32[13, 3][3, 1]cpu"):
V0614 00:48:37.095000 140616046391680 torch/_inductor/compile_fx.py:748] [14/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1172 in torch_dynamo_resume_in__bigbird_block_rand_mask_with_head_at_1165, code: rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :]
V0614 00:48:37.095000 140616046391680 torch/_inductor/compile_fx.py:748] [14/0] [__post_grad_graphs]         slice_1: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg0_1, 0, 1, 12);  arg0_1 = None
V0614 00:48:37.095000 140616046391680 torch/_inductor/compile_fx.py:748] [14/0] [__post_grad_graphs]         slice_3: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg1_1, 0, 1, 12);  arg1_1 = None
V0614 00:48:37.095000 140616046391680 torch/_inductor/compile_fx.py:748] [14/0] [__post_grad_graphs]         slice_5: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg2_1, 0, 1, 12);  arg2_1 = None
V0614 00:48:37.095000 140616046391680 torch/_inductor/compile_fx.py:748] [14/0] [__post_grad_graphs]         slice_7: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg3_1, 0, 1, 12);  arg3_1 = None
V0614 00:48:37.095000 140616046391680 torch/_inductor/compile_fx.py:748] [14/0] [__post_grad_graphs]         slice_9: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg4_1, 0, 1, 12);  arg4_1 = None
V0614 00:48:37.095000 140616046391680 torch/_inductor/compile_fx.py:748] [14/0] [__post_grad_graphs]         slice_11: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg5_1, 0, 1, 12);  arg5_1 = None
V0614 00:48:37.095000 140616046391680 torch/_inductor/compile_fx.py:748] [14/0] [__post_grad_graphs]         slice_13: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg6_1, 0, 1, 12);  arg6_1 = None
V0614 00:48:37.095000 140616046391680 torch/_inductor/compile_fx.py:748] [14/0] [__post_grad_graphs]         slice_15: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg7_1, 0, 1, 12);  arg7_1 = None
V0614 00:48:37.095000 140616046391680 torch/_inductor/compile_fx.py:748] [14/0] [__post_grad_graphs]         slice_17: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg8_1, 0, 1, 12);  arg8_1 = None
V0614 00:48:37.095000 140616046391680 torch/_inductor/compile_fx.py:748] [14/0] [__post_grad_graphs]         slice_19: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg9_1, 0, 1, 12);  arg9_1 = None
V0614 00:48:37.095000 140616046391680 torch/_inductor/compile_fx.py:748] [14/0] [__post_grad_graphs]         slice_21: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg10_1, 0, 1, 12);  arg10_1 = None
V0614 00:48:37.095000 140616046391680 torch/_inductor/compile_fx.py:748] [14/0] [__post_grad_graphs]         slice_23: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg11_1, 0, 1, 12);  arg11_1 = None
V0614 00:48:37.095000 140616046391680 torch/_inductor/compile_fx.py:748] [14/0] [__post_grad_graphs]         return (slice_1, slice_3, slice_5, slice_7, slice_9, slice_11, slice_13, slice_15, slice_17, slice_19, slice_21, slice_23)
V0614 00:48:37.095000 140616046391680 torch/_inductor/compile_fx.py:748] [14/0] [__post_grad_graphs]
V0614 00:48:37.095000 140616046391680 torch/_inductor/compile_fx.py:748] [14/0] [__post_grad_graphs]
V0614 00:48:37.096000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %arg0_1 : [num_users=1] = placeholder[target=arg0_1]
V0614 00:48:37.097000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %arg1_1 : [num_users=1] = placeholder[target=arg1_1]
V0614 00:48:37.098000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %arg2_1 : [num_users=1] = placeholder[target=arg2_1]
V0614 00:48:37.098000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %arg3_1 : [num_users=1] = placeholder[target=arg3_1]
V0614 00:48:37.098000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %arg4_1 : [num_users=1] = placeholder[target=arg4_1]
V0614 00:48:37.098000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %arg5_1 : [num_users=1] = placeholder[target=arg5_1]
V0614 00:48:37.099000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %arg6_1 : [num_users=1] = placeholder[target=arg6_1]
V0614 00:48:37.099000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %arg7_1 : [num_users=1] = placeholder[target=arg7_1]
V0614 00:48:37.099000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %arg8_1 : [num_users=1] = placeholder[target=arg8_1]
V0614 00:48:37.099000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %arg9_1 : [num_users=1] = placeholder[target=arg9_1]
V0614 00:48:37.100000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %arg10_1 : [num_users=1] = placeholder[target=arg10_1]
V0614 00:48:37.100000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %arg11_1 : [num_users=1] = placeholder[target=arg11_1]
V0614 00:48:37.100000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %slice_1 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%arg0_1, 0, 1, 12), kwargs = {})
V0614 00:48:37.100000 140616046391680 torch/_inductor/graph.py:976] [14/0]   via <function slice_ at 0x7fe2179c3490>
V0614 00:48:37.101000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %slice_3 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%arg1_1, 0, 1, 12), kwargs = {})
V0614 00:48:37.102000 140616046391680 torch/_inductor/graph.py:976] [14/0]   via <function slice_ at 0x7fe2179c3490>
V0614 00:48:37.102000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %slice_5 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%arg2_1, 0, 1, 12), kwargs = {})
V0614 00:48:37.102000 140616046391680 torch/_inductor/graph.py:976] [14/0]   via <function slice_ at 0x7fe2179c3490>
V0614 00:48:37.103000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %slice_7 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%arg3_1, 0, 1, 12), kwargs = {})
V0614 00:48:37.103000 140616046391680 torch/_inductor/graph.py:976] [14/0]   via <function slice_ at 0x7fe2179c3490>
V0614 00:48:37.104000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %slice_9 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%arg4_1, 0, 1, 12), kwargs = {})
V0614 00:48:37.104000 140616046391680 torch/_inductor/graph.py:976] [14/0]   via <function slice_ at 0x7fe2179c3490>
V0614 00:48:37.104000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %slice_11 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%arg5_1, 0, 1, 12), kwargs = {})
V0614 00:48:37.105000 140616046391680 torch/_inductor/graph.py:976] [14/0]   via <function slice_ at 0x7fe2179c3490>
V0614 00:48:37.105000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %slice_13 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%arg6_1, 0, 1, 12), kwargs = {})
V0614 00:48:37.105000 140616046391680 torch/_inductor/graph.py:976] [14/0]   via <function slice_ at 0x7fe2179c3490>
V0614 00:48:37.106000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %slice_15 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%arg7_1, 0, 1, 12), kwargs = {})
V0614 00:48:37.106000 140616046391680 torch/_inductor/graph.py:976] [14/0]   via <function slice_ at 0x7fe2179c3490>
V0614 00:48:37.107000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %slice_17 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%arg8_1, 0, 1, 12), kwargs = {})
V0614 00:48:37.107000 140616046391680 torch/_inductor/graph.py:976] [14/0]   via <function slice_ at 0x7fe2179c3490>
V0614 00:48:37.108000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %slice_19 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%arg9_1, 0, 1, 12), kwargs = {})
V0614 00:48:37.108000 140616046391680 torch/_inductor/graph.py:976] [14/0]   via <function slice_ at 0x7fe2179c3490>
V0614 00:48:37.108000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %slice_21 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%arg10_1, 0, 1, 12), kwargs = {})
V0614 00:48:37.108000 140616046391680 torch/_inductor/graph.py:976] [14/0]   via <function slice_ at 0x7fe2179c3490>
V0614 00:48:37.109000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %slice_23 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%arg11_1, 0, 1, 12), kwargs = {})
V0614 00:48:37.109000 140616046391680 torch/_inductor/graph.py:976] [14/0]   via <function slice_ at 0x7fe2179c3490>
V0614 00:48:37.110000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering return (slice_1, slice_3, slice_5, slice_7, slice_9, slice_11, slice_13, slice_15, slice_17, slice_19, slice_21, slice_23)
V0614 00:48:37.110000 140616046391680 torch/_inductor/graph.py:1097] [14/0] Force channels last inputs for 0 conv for the current graph with id 7
V0614 00:48:37.111000 140616046391680 torch/_inductor/scheduler.py:1671] [14/0] scheduling output arg0_1
V0614 00:48:37.111000 140616046391680 torch/_inductor/scheduler.py:1671] [14/0] scheduling output arg1_1
V0614 00:48:37.112000 140616046391680 torch/_inductor/scheduler.py:1671] [14/0] scheduling output arg2_1
V0614 00:48:37.112000 140616046391680 torch/_inductor/scheduler.py:1671] [14/0] scheduling output arg3_1
V0614 00:48:37.112000 140616046391680 torch/_inductor/scheduler.py:1671] [14/0] scheduling output arg4_1
V0614 00:48:37.112000 140616046391680 torch/_inductor/scheduler.py:1671] [14/0] scheduling output arg5_1
V0614 00:48:37.112000 140616046391680 torch/_inductor/scheduler.py:1671] [14/0] scheduling output arg6_1
V0614 00:48:37.112000 140616046391680 torch/_inductor/scheduler.py:1671] [14/0] scheduling output arg7_1
V0614 00:48:37.112000 140616046391680 torch/_inductor/scheduler.py:1671] [14/0] scheduling output arg8_1
V0614 00:48:37.112000 140616046391680 torch/_inductor/scheduler.py:1671] [14/0] scheduling output arg9_1
V0614 00:48:37.112000 140616046391680 torch/_inductor/scheduler.py:1671] [14/0] scheduling output arg10_1
V0614 00:48:37.112000 140616046391680 torch/_inductor/scheduler.py:1671] [14/0] scheduling output arg11_1
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] Output code:
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] # AOT ID: ['7_inference']
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] from ctypes import c_void_p, c_long
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] import torch
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] import math
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] import random
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] import os
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] import tempfile
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] from math import inf, nan
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] from torch._inductor.hooks import run_intermediate_hooks
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] from torch._inductor.utils import maybe_profile
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] from torch._inductor.codegen.memory_planning import _align as align
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] from torch import device, empty_strided
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] from torch._inductor.async_compile import AsyncCompile
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] from torch._inductor.select_algorithm import extern_kernels
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] from torch._inductor.codegen.multi_kernel import MultiKernelCall
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] aten = torch.ops.aten
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] inductor_ops = torch.ops.inductor
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] _quantized = torch.ops._quantized
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] assert_size_stride = torch._C._dynamo.guards.assert_size_stride
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] alloc_from_pool = torch.ops.inductor._alloc_from_pool
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] async_compile = AsyncCompile()
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] async_compile.wait(globals())
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] del async_compile
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] def call(args):
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]     arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1 = args
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]     args.clear()
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]     assert_size_stride(arg0_1, (13, 3), (3, 1))
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]     assert_size_stride(arg1_1, (13, 3), (3, 1))
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]     assert_size_stride(arg2_1, (13, 3), (3, 1))
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]     assert_size_stride(arg3_1, (13, 3), (3, 1))
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]     assert_size_stride(arg4_1, (13, 3), (3, 1))
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]     assert_size_stride(arg5_1, (13, 3), (3, 1))
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]     assert_size_stride(arg6_1, (13, 3), (3, 1))
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]     assert_size_stride(arg7_1, (13, 3), (3, 1))
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]     assert_size_stride(arg8_1, (13, 3), (3, 1))
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]     assert_size_stride(arg9_1, (13, 3), (3, 1))
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]     assert_size_stride(arg10_1, (13, 3), (3, 1))
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]     assert_size_stride(arg11_1, (13, 3), (3, 1))
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]     return (reinterpret_tensor(arg0_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg1_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg2_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg3_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg4_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg5_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg6_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg7_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg8_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg9_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg10_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg11_1, (11, 3), (3, 1), 3), )
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] def benchmark_compiled_module(times=10, repeat=10):
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]     from torch._dynamo.testing import rand_strided
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]     from torch._inductor.utils import print_performance
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]     arg0_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]     arg1_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]     arg2_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]     arg3_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]     arg4_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]     arg5_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]     arg6_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]     arg7_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]     arg8_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]     arg9_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]     arg10_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]     arg11_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]     fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1])
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]     return print_performance(fn, times=times, repeat=repeat)
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] if __name__ == "__main__":
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]     from torch._inductor.wrapper_benchmark import compiled_module_main
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]     compiled_module_main('hf_BigBird', benchmark_compiled_module)
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]
V0614 00:48:37.115000 140616046391680 torch/_inductor/graph.py:1714] [14/0] Output code written to: /tmp/torchinductor_leslie/ki/ckiwvz5uks7efnnd5ew4d76276bcutmxkkfh6ozi5dgumqv3m2qc.py
I0614 00:48:37.115000 140616046391680 torch/_inductor/graph.py:1715] [14/0] [__output_code] Output code written to: /tmp/torchinductor_leslie/ki/ckiwvz5uks7efnnd5ew4d76276bcutmxkkfh6ozi5dgumqv3m2qc.py
V0614 00:48:37.115000 140616046391680 torch/_inductor/compile_fx.py:531] [14/0] FX codegen and compilation took 0.027s
I0614 00:48:37.115000 140616046391680 torch/_dynamo/logging.py:56] [14/0] Step 3: torchinductor done compiling FORWARDS graph 7
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] TRACED GRAPH
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]  ===== FROZEN GRAPH =====
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]  /localdisk/leslie/torch_inductor_community/pytorch/torch/fx/_lazy_graph_module.py class <lambda>(torch.nn.Module):
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]     def forward(self, arg0_1: "i32[11, 3]", arg1_1: "i32[11, 3]", arg2_1: "i32[11, 3]", arg3_1: "i32[11, 3]", arg4_1: "i32[11, 3]", arg5_1: "i32[11, 3]", arg6_1: "i32[11, 3]", arg7_1: "i32[11, 3]", arg8_1: "i32[11, 3]", arg9_1: "i32[11, 3]", arg10_1: "i32[11, 3]", arg11_1: "i32[11, 3]", arg12_1: "bf16[1, 12, 832, 64]", arg13_1: "f32[1, 13, 64]", arg14_1: "bf16[1, 12, 832, 64]", arg15_1: "bf16[1, 12, 832, 64]", arg16_1: "f32[1, 1, 1, 832]", arg17_1: "f32[1, 1, 9, 64, 192]", arg18_1: "f32[1, 1, 832, 1]"):
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:593 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = np.stack(rand_attn, axis=0)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         cat: "i32[132, 3]" = torch.ops.aten.cat.default([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1]);  arg0_1 = arg1_1 = arg2_1 = arg3_1 = arg4_1 = arg5_1 = arg6_1 = arg7_1 = arg8_1 = arg9_1 = arg10_1 = arg11_1 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view: "i32[12, 11, 3]" = torch.ops.aten.reshape.default(cat, [12, 11, 3]);  cat = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:594 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = torch.tensor(rand_attn, device=query_layer.device, dtype=torch.long)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         convert_element_type: "i64[12, 11, 3]" = torch.ops.prims.convert_element_type.default(view, torch.int64);  view = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:595 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn.unsqueeze_(0)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         unsqueeze: "i64[1, 12, 11, 3]" = torch.ops.aten.unsqueeze.default(convert_element_type, 0);  convert_element_type = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in _create_rand_mask_from_inputs, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         select: "f32[13, 64]" = torch.ops.aten.select.int(arg13_1, 0, 0)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         select_1: "i64[12, 11, 3]" = torch.ops.aten.select.int(unsqueeze, 0, 0)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in <listcomp>, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_1: "i64[396]" = torch.ops.aten.reshape.default(select_1, [396]);  select_1 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         index: "f32[396, 64]" = torch.ops.aten.index.Tensor(select, [view_1]);  select = view_1 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in _create_rand_mask_from_inputs, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_2: "f32[1, 396, 64]" = torch.ops.aten.reshape.default(index, [1, 396, 64]);  index = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1016 in _create_rand_mask_from_inputs, code: rand_mask = rand_mask.view(batch_size, num_attention_heads, num_windows, num_rand_blocks * from_block_size)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_3: "f32[1, 12, 11, 192]" = torch.ops.aten.reshape.default(view_2, [1, 12, 11, 192]);  view_2 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1017 in _create_rand_mask_from_inputs, code: rand_mask = torch.einsum("blq,bhlk->bhlqk", from_blocked_mask[:, 1:-1], rand_mask)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         slice_2: "f32[1, 11, 64]" = torch.ops.aten.slice.Tensor(arg13_1, 1, 1, -1);  arg13_1 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         unsqueeze_1: "f32[1, 11, 64, 1]" = torch.ops.aten.unsqueeze.default(slice_2, 3);  slice_2 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         unsqueeze_2: "f32[1, 11, 64, 1, 1]" = torch.ops.aten.unsqueeze.default(unsqueeze_1, 4);  unsqueeze_1 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         permute: "f32[1, 1, 11, 64, 1]" = torch.ops.aten.permute.default(unsqueeze_2, [0, 3, 1, 2, 4]);  unsqueeze_2 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         unsqueeze_3: "f32[1, 12, 11, 192, 1]" = torch.ops.aten.unsqueeze.default(view_3, 4);  view_3 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         permute_1: "f32[1, 12, 11, 1, 192]" = torch.ops.aten.permute.default(unsqueeze_3, [0, 1, 2, 4, 3]);  unsqueeze_3 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         mul: "f32[1, 12, 11, 64, 192]" = torch.ops.aten.mul.Tensor(permute, permute_1);  permute = permute_1 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:602 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_query_matrix = query_layer.view(bsz, n_heads, from_seq_len // from_block_size, from_block_size, -1)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_4: "bf16[1, 12, 13, 64, 64]" = torch.ops.aten.reshape.default(arg12_1, [1, 12, 13, 64, -1]);  arg12_1 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:603 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix = key_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_5: "bf16[1, 12, 13, 64, 64]" = torch.ops.aten.reshape.default(arg14_1, [1, 12, 13, 64, -1])
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:604 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix = value_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_6: "bf16[1, 12, 13, 64, 64]" = torch.ops.aten.reshape.default(arg15_1, [1, 12, 13, 64, -1])
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:972 in torch_gather_b2, code: shift = torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         iota: "i64[396]" = torch.ops.prims.iota.default(396, start = 0, step = 1, dtype = torch.int64, device = device(type='cpu'), requires_grad = False)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:973 in torch_gather_b2, code: indices_shift = torch.div(shift, num_indices_to_gather, rounding_mode="floor") * num_indices_to_pick_from
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         div: "i64[396]" = torch.ops.aten.div.Tensor_mode(iota, 33, rounding_mode = 'floor');  iota = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         mul_1: "i64[396]" = torch.ops.aten.mul.Tensor(div, 13);  div = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:975 in torch_gather_b2, code: flattened_indices = indices.view(-1) + indices_shift
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_7: "i64[396]" = torch.ops.aten.reshape.default(unsqueeze, [-1])
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         add: "i64[396]" = torch.ops.aten.add.Tensor(view_7, mul_1);  view_7 = mul_1 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:976 in torch_gather_b2, code: flattened_params = params.reshape(-1, params.shape[-2], params.shape[-1])
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         clone_2: "bf16[1, 12, 13, 64, 64]" = torch.ops.aten.clone.default(view_5, memory_format = torch.contiguous_format)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_8: "bf16[156, 64, 64]" = torch.ops.aten.reshape.default(clone_2, [156, 64, 64]);  clone_2 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:978 in torch_gather_b2, code: out_flattened = flattened_params.index_select(0, flattened_indices)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         index_1: "bf16[396, 64, 64]" = torch.ops.aten.index.Tensor(view_8, [add]);  view_8 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:980 in torch_gather_b2, code: out = out_flattened.reshape(params.shape[:2] + (num_indices_to_gather,) + params.shape[3:])
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_9: "bf16[1, 12, 33, 64, 64]" = torch.ops.aten.reshape.default(index_1, [1, 12, 33, 64, 64]);  index_1 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:608 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key = gathered_key.view(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_10: "bf16[1, 12, 11, 192, 64]" = torch.ops.aten.reshape.default(view_9, [1, 12, 11, 192, -1]);  view_9 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:976 in torch_gather_b2, code: flattened_params = params.reshape(-1, params.shape[-2], params.shape[-1])
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         clone_3: "bf16[1, 12, 13, 64, 64]" = torch.ops.aten.clone.default(view_6, memory_format = torch.contiguous_format)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_12: "bf16[156, 64, 64]" = torch.ops.aten.reshape.default(clone_3, [156, 64, 64]);  clone_3 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:978 in torch_gather_b2, code: out_flattened = flattened_params.index_select(0, flattened_indices)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         index_2: "bf16[396, 64, 64]" = torch.ops.aten.index.Tensor(view_12, [add]);  view_12 = add = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:980 in torch_gather_b2, code: out = out_flattened.reshape(params.shape[:2] + (num_indices_to_gather,) + params.shape[3:])
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_13: "bf16[1, 12, 33, 64, 64]" = torch.ops.aten.reshape.default(index_2, [1, 12, 33, 64, 64]);  index_2 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:612 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value = gathered_value.view(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_14: "bf16[1, 12, 11, 192, 64]" = torch.ops.aten.reshape.default(view_13, [1, 12, 11, 192, -1]);  view_13 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:621 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 0], key_layer, ndim=4)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         select_2: "bf16[1, 12, 64, 64]" = torch.ops.aten.select.int(view_4, 2, 0)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_15: "bf16[12, 64, 64]" = torch.ops.aten.reshape.default(select_2, [12, 64, 64]);  select_2 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_16: "bf16[12, 832, 64]" = torch.ops.aten.reshape.default(arg14_1, [12, 832, 64]);  arg14_1 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         permute_2: "bf16[12, 64, 832]" = torch.ops.aten.permute.default(view_16, [0, 2, 1]);  view_16 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         bmm: "bf16[12, 64, 832]" = torch.ops.aten.bmm.default(view_15, permute_2);  view_15 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_17: "bf16[1, 12, 64, 832]" = torch.ops.aten.reshape.default(bmm, [1, 12, 64, 832]);  bmm = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:623 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product = first_product * rsqrt_d
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         mul_3: "bf16[1, 12, 64, 832]" = torch.ops.aten.mul.Tensor(view_17, 0.125);  view_17 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:624 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product += (1.0 - to_mask) * attn_mask_penalty
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         sub: "f32[1, 1, 1, 832]" = torch.ops.aten.sub.Tensor(1.0, arg16_1)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         mul_4: "f32[1, 1, 1, 832]" = torch.ops.aten.mul.Tensor(sub, -10000.0);  sub = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         add_2: "f32[1, 12, 64, 832]" = torch.ops.aten.add.Tensor(mul_3, mul_4);  mul_3 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:625 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_attn_weights = nn.functional.softmax(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         amax: "f32[1, 12, 64, 1]" = torch.ops.aten.amax.default(add_2, [-1], True)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         sub_1: "f32[1, 12, 64, 832]" = torch.ops.aten.sub.Tensor(add_2, amax);  add_2 = amax = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         exp: "f32[1, 12, 64, 832]" = torch.ops.aten.exp.default(sub_1);  sub_1 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         sum_1: "f32[1, 12, 64, 1]" = torch.ops.aten.sum.dim_IntList(exp, [-1], True)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         div_2: "f32[1, 12, 64, 832]" = torch.ops.aten.div.Tensor(exp, sum_1);  exp = sum_1 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         convert_element_type_5: "bf16[1, 12, 64, 832]" = torch.ops.prims.convert_element_type.default(div_2, torch.bfloat16);  div_2 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_18: "bf16[12, 64, 832]" = torch.ops.aten.reshape.default(convert_element_type_5, [-1, 64, 832]);  convert_element_type_5 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_19: "bf16[12, 832, 64]" = torch.ops.aten.reshape.default(arg15_1, [12, 832, 64]);  arg15_1 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         bmm_1: "bf16[12, 64, 64]" = torch.ops.aten.bmm.default(view_18, view_19);  view_18 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_20: "bf16[1, 12, 64, 64]" = torch.ops.aten.reshape.default(bmm_1, [1, 12, 64, 64]);  bmm_1 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:631 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_context_layer.unsqueeze_(2)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         unsqueeze_4: "bf16[1, 12, 1, 64, 64]" = torch.ops.aten.unsqueeze.default(view_20, 2);  view_20 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:641 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 0],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         select_3: "bf16[1, 12, 64, 64]" = torch.ops.aten.select.int(view_5, 2, 0)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:642 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 1],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         select_4: "bf16[1, 12, 64, 64]" = torch.ops.aten.select.int(view_5, 2, 1)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:643 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 2],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         select_5: "bf16[1, 12, 64, 64]" = torch.ops.aten.select.int(view_5, 2, 2)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:644 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -1],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         select_6: "bf16[1, 12, 64, 64]" = torch.ops.aten.select.int(view_5, 2, -1)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:645 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key[:, :, 0],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         select_7: "bf16[1, 12, 192, 64]" = torch.ops.aten.select.int(view_10, 2, 0)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:639 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_key_mat = torch.cat(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         cat_1: "bf16[1, 12, 448, 64]" = torch.ops.aten.cat.default([select_3, select_4, select_5, select_6, select_7], 2);  select_4 = select_5 = select_7 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:651 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 0],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         select_8: "bf16[1, 12, 64, 64]" = torch.ops.aten.select.int(view_6, 2, 0)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:652 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 1],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         select_9: "bf16[1, 12, 64, 64]" = torch.ops.aten.select.int(view_6, 2, 1)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:653 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 2],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         select_10: "bf16[1, 12, 64, 64]" = torch.ops.aten.select.int(view_6, 2, 2)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:654 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -1],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         select_11: "bf16[1, 12, 64, 64]" = torch.ops.aten.select.int(view_6, 2, -1)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:655 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value[:, :, 0],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         select_12: "bf16[1, 12, 192, 64]" = torch.ops.aten.select.int(view_14, 2, 0)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:649 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_value_mat = torch.cat(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         cat_2: "bf16[1, 12, 448, 64]" = torch.ops.aten.cat.default([select_8, select_9, select_10, select_11, select_12], 2);  select_9 = select_10 = select_12 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:661 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 1], second_key_mat, ndim=4)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         select_13: "bf16[1, 12, 64, 64]" = torch.ops.aten.select.int(view_4, 2, 1)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_21: "bf16[12, 64, 64]" = torch.ops.aten.reshape.default(select_13, [12, 64, 64]);  select_13 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_22: "bf16[12, 448, 64]" = torch.ops.aten.reshape.default(cat_1, [-1, 448, 64]);  cat_1 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         permute_3: "bf16[12, 64, 448]" = torch.ops.aten.permute.default(view_22, [0, 2, 1]);  view_22 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         bmm_2: "bf16[12, 64, 448]" = torch.ops.aten.bmm.default(view_21, permute_3);  view_21 = permute_3 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_23: "bf16[1, 12, 64, 448]" = torch.ops.aten.reshape.default(bmm_2, [1, 12, 64, 448]);  bmm_2 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:664 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, : 3 * to_block_size],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         slice_30: "f32[1, 1, 1, 192]" = torch.ops.aten.slice.Tensor(arg16_1, 3, 0, 192)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:665 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, -to_block_size:],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         slice_34: "f32[1, 1, 1, 64]" = torch.ops.aten.slice.Tensor(arg16_1, 3, -64, 9223372036854775807)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:666 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask.new_ones([bsz, 1, 1, n_rand_blocks * to_block_size]),
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         full_default: "f32[1, 1, 1, 192]" = torch.ops.aten.full.default([1, 1, 1, 192], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:662 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_seq_pad = torch.cat(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         cat_3: "f32[1, 1, 1, 448]" = torch.ops.aten.cat.default([slice_30, slice_34, full_default], 3);  slice_30 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:672 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask.new_ones([bsz, n_heads, from_block_size, 4 * to_block_size]),
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         full_default_1: "f32[1, 12, 64, 256]" = torch.ops.aten.full.default([1, 12, 64, 256], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:673 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask[:, :, 0],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         select_14: "f32[1, 12, 64, 192]" = torch.ops.aten.select.int(mul, 2, 0)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:670 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_rand_pad = torch.cat(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         cat_4: "f32[1, 12, 64, 448]" = torch.ops.aten.cat.default([full_default_1, select_14], 3);  select_14 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:677 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product = second_product * rsqrt_d
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         mul_5: "bf16[1, 12, 64, 448]" = torch.ops.aten.mul.Tensor(view_23, 0.125);  view_23 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:678 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product += (1.0 - torch.minimum(second_seq_pad, second_rand_pad)) * attn_mask_penalty
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         minimum: "f32[1, 12, 64, 448]" = torch.ops.aten.minimum.default(cat_3, cat_4);  cat_3 = cat_4 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         sub_2: "f32[1, 12, 64, 448]" = torch.ops.aten.sub.Tensor(1.0, minimum);  minimum = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         mul_6: "f32[1, 12, 64, 448]" = torch.ops.aten.mul.Tensor(sub_2, -10000.0);  sub_2 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         add_3: "f32[1, 12, 64, 448]" = torch.ops.aten.add.Tensor(mul_5, mul_6);  mul_5 = mul_6 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:679 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_attn_weights = nn.functional.softmax(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         amax_1: "f32[1, 12, 64, 1]" = torch.ops.aten.amax.default(add_3, [-1], True)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         sub_3: "f32[1, 12, 64, 448]" = torch.ops.aten.sub.Tensor(add_3, amax_1);  add_3 = amax_1 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         exp_1: "f32[1, 12, 64, 448]" = torch.ops.aten.exp.default(sub_3);  sub_3 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         sum_2: "f32[1, 12, 64, 1]" = torch.ops.aten.sum.dim_IntList(exp_1, [-1], True)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         div_3: "f32[1, 12, 64, 448]" = torch.ops.aten.div.Tensor(exp_1, sum_2);  exp_1 = sum_2 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         convert_element_type_12: "bf16[1, 12, 64, 448]" = torch.ops.prims.convert_element_type.default(div_3, torch.bfloat16);  div_3 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_24: "bf16[12, 64, 448]" = torch.ops.aten.reshape.default(convert_element_type_12, [-1, 64, 448]);  convert_element_type_12 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_25: "bf16[12, 448, 64]" = torch.ops.aten.reshape.default(cat_2, [-1, 448, 64]);  cat_2 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         bmm_3: "bf16[12, 64, 64]" = torch.ops.aten.bmm.default(view_24, view_25);  view_24 = view_25 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_26: "bf16[1, 12, 64, 64]" = torch.ops.aten.reshape.default(bmm_3, [1, 12, 64, 64]);  bmm_3 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:686 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_context_layer.unsqueeze_(2)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         unsqueeze_5: "bf16[1, 12, 1, 64, 64]" = torch.ops.aten.unsqueeze.default(view_26, 2);  view_26 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:696 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: [blocked_key_matrix[:, :, 1:-3], blocked_key_matrix[:, :, 2:-2], blocked_key_matrix[:, :, 3:-1]], dim=3
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         slice_39: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.slice.Tensor(view_5, 2, 1, -3)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         slice_42: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.slice.Tensor(view_5, 2, 2, -2)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         slice_45: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.slice.Tensor(view_5, 2, 3, -1)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:695 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: exp_blocked_key_matrix = torch.cat(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         cat_5: "bf16[1, 12, 9, 192, 64]" = torch.ops.aten.cat.default([slice_39, slice_42, slice_45], 3);  slice_39 = slice_42 = slice_45 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:699 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: [blocked_value_matrix[:, :, 1:-3], blocked_value_matrix[:, :, 2:-2], blocked_value_matrix[:, :, 3:-1]],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         slice_48: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.slice.Tensor(view_6, 2, 1, -3)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         slice_51: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.slice.Tensor(view_6, 2, 2, -2)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         slice_54: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.slice.Tensor(view_6, 2, 3, -1)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:698 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: exp_blocked_value_matrix = torch.cat(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         cat_6: "bf16[1, 12, 9, 192, 64]" = torch.ops.aten.cat.default([slice_48, slice_51, slice_54], 3);  slice_48 = slice_51 = slice_54 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:702 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: middle_query_matrix = blocked_query_matrix[:, :, 2:-2]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         slice_57: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.slice.Tensor(view_4, 2, 2, -2)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         clone_4: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.clone.default(slice_57, memory_format = torch.contiguous_format)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_27: "bf16[108, 64, 64]" = torch.ops.aten.reshape.default(clone_4, [108, 64, 64]);  clone_4 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_28: "bf16[108, 192, 64]" = torch.ops.aten.reshape.default(cat_5, [-1, 192, 64]);  cat_5 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         permute_4: "bf16[108, 64, 192]" = torch.ops.aten.permute.default(view_28, [0, 2, 1]);  view_28 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         bmm_4: "bf16[108, 64, 192]" = torch.ops.aten.bmm.default(view_27, permute_4);  permute_4 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_29: "bf16[1, 12, 9, 64, 192]" = torch.ops.aten.reshape.default(bmm_4, [1, 12, 9, 64, 192]);  bmm_4 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:708 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: inner_band_product = inner_band_product * rsqrt_d
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         mul_7: "bf16[1, 12, 9, 64, 192]" = torch.ops.aten.mul.Tensor(view_29, 0.125);  view_29 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:712 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product = self.torch_bmm_nd_transpose(middle_query_matrix, gathered_key[:, :, 1:-1], ndim=5)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         slice_60: "bf16[1, 12, 9, 192, 64]" = torch.ops.aten.slice.Tensor(view_10, 2, 1, -1)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         clone_6: "bf16[1, 12, 9, 192, 64]" = torch.ops.aten.clone.default(slice_60, memory_format = torch.contiguous_format);  slice_60 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_31: "bf16[108, 192, 64]" = torch.ops.aten.reshape.default(clone_6, [108, 192, 64]);  clone_6 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         permute_5: "bf16[108, 64, 192]" = torch.ops.aten.permute.default(view_31, [0, 2, 1]);  view_31 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         bmm_5: "bf16[108, 64, 192]" = torch.ops.aten.bmm.default(view_27, permute_5);  view_27 = permute_5 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_32: "bf16[1, 12, 9, 64, 192]" = torch.ops.aten.reshape.default(bmm_5, [1, 12, 9, 64, 192]);  bmm_5 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:714 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product = rand_band_product * rsqrt_d
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         mul_8: "bf16[1, 12, 9, 64, 192]" = torch.ops.aten.mul.Tensor(view_32, 0.125);  view_32 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:717 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product = torch.einsum(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         unsqueeze_6: "bf16[1, 12, 9, 64, 64, 1]" = torch.ops.aten.unsqueeze.default(slice_57, 5);  slice_57 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         permute_6: "bf16[1, 12, 9, 64, 1, 64]" = torch.ops.aten.permute.default(unsqueeze_6, [0, 1, 2, 3, 5, 4]);  unsqueeze_6 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         unsqueeze_7: "bf16[1, 12, 64, 64, 1]" = torch.ops.aten.unsqueeze.default(select_3, 4)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         unsqueeze_8: "bf16[1, 12, 64, 64, 1, 1]" = torch.ops.aten.unsqueeze.default(unsqueeze_7, 5);  unsqueeze_7 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         permute_7: "bf16[1, 12, 1, 1, 64, 64]" = torch.ops.aten.permute.default(unsqueeze_8, [0, 1, 4, 5, 2, 3]);  unsqueeze_8 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         permute_8: "bf16[12, 9, 64, 64, 1, 1]" = torch.ops.aten.permute.default(permute_6, [1, 2, 3, 5, 0, 4]);  permute_6 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_33: "bf16[12, 576, 64]" = torch.ops.aten.reshape.default(permute_8, [12, 576, 64]);  permute_8 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         permute_9: "bf16[12, 64, 1, 64, 1, 1]" = torch.ops.aten.permute.default(permute_7, [1, 5, 0, 4, 2, 3]);  permute_7 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_34: "bf16[12, 64, 64]" = torch.ops.aten.reshape.default(permute_9, [12, 64, 64]);  permute_9 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         bmm_6: "bf16[12, 576, 64]" = torch.ops.aten.bmm.default(view_33, view_34);  view_34 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_35: "bf16[12, 9, 64, 1, 1, 64]" = torch.ops.aten.reshape.default(bmm_6, [12, 9, 64, 1, 1, 64]);  bmm_6 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         permute_10: "bf16[1, 12, 9, 64, 64, 1]" = torch.ops.aten.permute.default(view_35, [4, 0, 1, 2, 5, 3]);  view_35 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_36: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.reshape.default(permute_10, [1, 12, 9, 64, 64]);  permute_10 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:720 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product = first_band_product * rsqrt_d
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         mul_9: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.mul.Tensor(view_36, 0.125);  view_36 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:723 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product = torch.einsum(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         unsqueeze_10: "bf16[1, 12, 64, 64, 1]" = torch.ops.aten.unsqueeze.default(select_6, 4)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         unsqueeze_11: "bf16[1, 12, 64, 64, 1, 1]" = torch.ops.aten.unsqueeze.default(unsqueeze_10, 5);  unsqueeze_10 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         permute_12: "bf16[1, 12, 1, 1, 64, 64]" = torch.ops.aten.permute.default(unsqueeze_11, [0, 1, 4, 5, 2, 3]);  unsqueeze_11 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         permute_14: "bf16[12, 64, 1, 64, 1, 1]" = torch.ops.aten.permute.default(permute_12, [1, 5, 0, 4, 2, 3]);  permute_12 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_38: "bf16[12, 64, 64]" = torch.ops.aten.reshape.default(permute_14, [12, 64, 64]);  permute_14 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         bmm_7: "bf16[12, 576, 64]" = torch.ops.aten.bmm.default(view_33, view_38);  view_33 = view_38 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_39: "bf16[12, 9, 64, 1, 1, 64]" = torch.ops.aten.reshape.default(bmm_7, [12, 9, 64, 1, 1, 64]);  bmm_7 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         permute_15: "bf16[1, 12, 9, 64, 64, 1]" = torch.ops.aten.permute.default(view_39, [4, 0, 1, 2, 5, 3]);  view_39 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_40: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.reshape.default(permute_15, [1, 12, 9, 64, 64]);  permute_15 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:726 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product = last_band_product * rsqrt_d
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         mul_10: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.mul.Tensor(view_40, 0.125);  view_40 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:729 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: inner_band_product += (1.0 - band_mask) * attn_mask_penalty
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         sub_4: "f32[1, 1, 9, 64, 192]" = torch.ops.aten.sub.Tensor(1.0, arg17_1);  arg17_1 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         mul_11: "f32[1, 1, 9, 64, 192]" = torch.ops.aten.mul.Tensor(sub_4, -10000.0);  sub_4 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         add_4: "f32[1, 12, 9, 64, 192]" = torch.ops.aten.add.Tensor(mul_7, mul_11);  mul_7 = mul_11 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         convert_element_type_23: "bf16[1, 12, 9, 64, 192]" = torch.ops.prims.convert_element_type.default(add_4, torch.bfloat16);  add_4 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:730 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product += (1.0 - to_mask[:, :, :, :to_block_size].unsqueeze(3)) * attn_mask_penalty
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         slice_68: "f32[1, 1, 1, 64]" = torch.ops.aten.slice.Tensor(arg16_1, 3, 0, 64)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         unsqueeze_12: "f32[1, 1, 1, 1, 64]" = torch.ops.aten.unsqueeze.default(slice_68, 3)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         sub_5: "f32[1, 1, 1, 1, 64]" = torch.ops.aten.sub.Tensor(1.0, unsqueeze_12);  unsqueeze_12 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         mul_12: "f32[1, 1, 1, 1, 64]" = torch.ops.aten.mul.Tensor(sub_5, -10000.0);  sub_5 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         add_5: "f32[1, 12, 9, 64, 64]" = torch.ops.aten.add.Tensor(mul_9, mul_12);  mul_9 = mul_12 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         convert_element_type_24: "bf16[1, 12, 9, 64, 64]" = torch.ops.prims.convert_element_type.default(add_5, torch.bfloat16);  add_5 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:731 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product += (1.0 - to_mask[:, :, :, -to_block_size:].unsqueeze(3)) * attn_mask_penalty
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         unsqueeze_13: "f32[1, 1, 1, 1, 64]" = torch.ops.aten.unsqueeze.default(slice_34, 3);  slice_34 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         sub_6: "f32[1, 1, 1, 1, 64]" = torch.ops.aten.sub.Tensor(1.0, unsqueeze_13);  unsqueeze_13 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         mul_13: "f32[1, 1, 1, 1, 64]" = torch.ops.aten.mul.Tensor(sub_6, -10000.0);  sub_6 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         add_6: "f32[1, 12, 9, 64, 64]" = torch.ops.aten.add.Tensor(mul_10, mul_13);  mul_10 = mul_13 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         convert_element_type_25: "bf16[1, 12, 9, 64, 64]" = torch.ops.prims.convert_element_type.default(add_6, torch.bfloat16);  add_6 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:732 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product += (1.0 - rand_mask[:, :, 1:-1]) * attn_mask_penalty
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         slice_75: "f32[1, 12, 9, 64, 192]" = torch.ops.aten.slice.Tensor(mul, 2, 1, -1)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         sub_7: "f32[1, 12, 9, 64, 192]" = torch.ops.aten.sub.Tensor(1.0, slice_75);  slice_75 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         mul_14: "f32[1, 12, 9, 64, 192]" = torch.ops.aten.mul.Tensor(sub_7, -10000.0);  sub_7 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         add_7: "f32[1, 12, 9, 64, 192]" = torch.ops.aten.add.Tensor(mul_8, mul_14);  mul_8 = mul_14 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         convert_element_type_26: "bf16[1, 12, 9, 64, 192]" = torch.ops.prims.convert_element_type.default(add_7, torch.bfloat16);  add_7 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:735 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: band_product = torch.cat(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         cat_7: "bf16[1, 12, 9, 64, 512]" = torch.ops.aten.cat.default([convert_element_type_24, convert_element_type_23, convert_element_type_26, convert_element_type_25], -1);  convert_element_type_24 = convert_element_type_23 = convert_element_type_26 = convert_element_type_25 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:740 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights = nn.functional.softmax(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         convert_element_type_27: "f32[1, 12, 9, 64, 512]" = torch.ops.prims.convert_element_type.default(cat_7, torch.float32);  cat_7 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         amax_2: "f32[1, 12, 9, 64, 1]" = torch.ops.aten.amax.default(convert_element_type_27, [-1], True)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         sub_8: "f32[1, 12, 9, 64, 512]" = torch.ops.aten.sub.Tensor(convert_element_type_27, amax_2);  convert_element_type_27 = amax_2 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         exp_2: "f32[1, 12, 9, 64, 512]" = torch.ops.aten.exp.default(sub_8);  sub_8 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         sum_3: "f32[1, 12, 9, 64, 1]" = torch.ops.aten.sum.dim_IntList(exp_2, [-1], True)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         div_4: "f32[1, 12, 9, 64, 512]" = torch.ops.aten.div.Tensor(exp_2, sum_3);  exp_2 = sum_3 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         convert_element_type_28: "bf16[1, 12, 9, 64, 512]" = torch.ops.prims.convert_element_type.default(div_4, torch.bfloat16);  div_4 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:747 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights[:, :, :, :, to_block_size : 4 * to_block_size], exp_blocked_value_matrix, ndim=5
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         slice_80: "bf16[1, 12, 9, 64, 192]" = torch.ops.aten.slice.Tensor(convert_element_type_28, 4, 64, 256)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_41: "bf16[108, 64, 192]" = torch.ops.aten.reshape.default(slice_80, [108, 64, 192]);  slice_80 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_42: "bf16[108, 192, 64]" = torch.ops.aten.reshape.default(cat_6, [-1, 192, 64]);  cat_6 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         bmm_8: "bf16[108, 64, 64]" = torch.ops.aten.bmm.default(view_41, view_42);  view_41 = view_42 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_43: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.reshape.default(bmm_8, [1, 12, 9, 64, 64]);  bmm_8 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:754 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights[:, :, :, :, 4 * to_block_size : -to_block_size], gathered_value[:, :, 1:-1], ndim=5
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         slice_85: "bf16[1, 12, 9, 64, 192]" = torch.ops.aten.slice.Tensor(convert_element_type_28, 4, 256, -64)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         slice_88: "bf16[1, 12, 9, 192, 64]" = torch.ops.aten.slice.Tensor(view_14, 2, 1, -1)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_44: "bf16[108, 64, 192]" = torch.ops.aten.reshape.default(slice_85, [108, 64, 192]);  slice_85 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         clone_7: "bf16[1, 12, 9, 192, 64]" = torch.ops.aten.clone.default(slice_88, memory_format = torch.contiguous_format);  slice_88 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_45: "bf16[108, 192, 64]" = torch.ops.aten.reshape.default(clone_7, [108, 192, 64]);  clone_7 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         bmm_9: "bf16[108, 64, 64]" = torch.ops.aten.bmm.default(view_44, view_45);  view_44 = view_45 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_46: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.reshape.default(bmm_9, [1, 12, 9, 64, 64]);  bmm_9 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:753 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += self.torch_bmm_nd(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         add_8: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.add.Tensor(view_43, view_46);  view_43 = view_46 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_47: "bf16[108, 64, 64]" = torch.ops.aten.reshape.default(add_8, [108, 64, 64]);  add_8 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_48: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.reshape.default(view_47, [1, 12, 9, 64, 64]);  view_47 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:760 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, :to_block_size], blocked_value_matrix[:, :, 0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         slice_93: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.slice.Tensor(convert_element_type_28, 4, 0, 64)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:759 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += torch.einsum(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         unsqueeze_14: "bf16[1, 12, 9, 64, 64, 1]" = torch.ops.aten.unsqueeze.default(slice_93, 5);  slice_93 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         permute_16: "bf16[1, 12, 9, 64, 1, 64]" = torch.ops.aten.permute.default(unsqueeze_14, [0, 1, 2, 3, 5, 4]);  unsqueeze_14 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         unsqueeze_15: "bf16[1, 12, 64, 64, 1]" = torch.ops.aten.unsqueeze.default(select_8, 4)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         unsqueeze_16: "bf16[1, 12, 64, 64, 1, 1]" = torch.ops.aten.unsqueeze.default(unsqueeze_15, 5);  unsqueeze_15 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         permute_17: "bf16[1, 12, 1, 1, 64, 64]" = torch.ops.aten.permute.default(unsqueeze_16, [0, 1, 4, 5, 3, 2]);  unsqueeze_16 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         permute_18: "bf16[12, 9, 64, 64, 1, 1]" = torch.ops.aten.permute.default(permute_16, [1, 2, 3, 5, 0, 4]);  permute_16 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_49: "bf16[12, 576, 64]" = torch.ops.aten.reshape.default(permute_18, [12, 576, 64]);  permute_18 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         permute_19: "bf16[12, 64, 1, 64, 1, 1]" = torch.ops.aten.permute.default(permute_17, [1, 5, 0, 4, 2, 3]);  permute_17 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_50: "bf16[12, 64, 64]" = torch.ops.aten.reshape.default(permute_19, [12, 64, 64]);  permute_19 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         bmm_10: "bf16[12, 576, 64]" = torch.ops.aten.bmm.default(view_49, view_50);  view_49 = view_50 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_51: "bf16[12, 9, 64, 1, 1, 64]" = torch.ops.aten.reshape.default(bmm_10, [12, 9, 64, 1, 1, 64]);  bmm_10 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         permute_20: "bf16[1, 12, 9, 64, 64, 1]" = torch.ops.aten.permute.default(view_51, [4, 0, 1, 2, 5, 3]);  view_51 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_52: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.reshape.default(permute_20, [1, 12, 9, 64, 64]);  permute_20 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         add_9: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.add.Tensor(view_48, view_52);  view_48 = view_52 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_53: "bf16[108, 64, 64]" = torch.ops.aten.reshape.default(add_9, [108, 64, 64]);  add_9 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_54: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.reshape.default(view_53, [1, 12, 9, 64, 64]);  view_53 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:763 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, -to_block_size:], blocked_value_matrix[:, :, -1]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         slice_100: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.slice.Tensor(convert_element_type_28, 4, -64, 9223372036854775807);  convert_element_type_28 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:762 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += torch.einsum(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         unsqueeze_17: "bf16[1, 12, 9, 64, 64, 1]" = torch.ops.aten.unsqueeze.default(slice_100, 5);  slice_100 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         permute_21: "bf16[1, 12, 9, 64, 1, 64]" = torch.ops.aten.permute.default(unsqueeze_17, [0, 1, 2, 3, 5, 4]);  unsqueeze_17 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         unsqueeze_18: "bf16[1, 12, 64, 64, 1]" = torch.ops.aten.unsqueeze.default(select_11, 4)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         unsqueeze_19: "bf16[1, 12, 64, 64, 1, 1]" = torch.ops.aten.unsqueeze.default(unsqueeze_18, 5);  unsqueeze_18 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         permute_22: "bf16[1, 12, 1, 1, 64, 64]" = torch.ops.aten.permute.default(unsqueeze_19, [0, 1, 4, 5, 3, 2]);  unsqueeze_19 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         permute_23: "bf16[12, 9, 64, 64, 1, 1]" = torch.ops.aten.permute.default(permute_21, [1, 2, 3, 5, 0, 4]);  permute_21 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_55: "bf16[12, 576, 64]" = torch.ops.aten.reshape.default(permute_23, [12, 576, 64]);  permute_23 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         permute_24: "bf16[12, 64, 1, 64, 1, 1]" = torch.ops.aten.permute.default(permute_22, [1, 5, 0, 4, 2, 3]);  permute_22 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_56: "bf16[12, 64, 64]" = torch.ops.aten.reshape.default(permute_24, [12, 64, 64]);  permute_24 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         bmm_11: "bf16[12, 576, 64]" = torch.ops.aten.bmm.default(view_55, view_56);  view_55 = view_56 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_57: "bf16[12, 9, 64, 1, 1, 64]" = torch.ops.aten.reshape.default(bmm_11, [12, 9, 64, 1, 1, 64]);  bmm_11 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         permute_25: "bf16[1, 12, 9, 64, 64, 1]" = torch.ops.aten.permute.default(view_57, [4, 0, 1, 2, 5, 3]);  view_57 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_58: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.reshape.default(permute_25, [1, 12, 9, 64, 64]);  permute_25 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         add_10: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.add.Tensor(view_54, view_58);  view_54 = view_58 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_59: "bf16[108, 64, 64]" = torch.ops.aten.reshape.default(add_10, [108, 64, 64]);  add_10 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_60: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.reshape.default(view_59, [1, 12, 9, 64, 64]);  view_59 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:776 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -3],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         select_20: "bf16[1, 12, 64, 64]" = torch.ops.aten.select.int(view_5, 2, -3)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:777 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -2],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         select_21: "bf16[1, 12, 64, 64]" = torch.ops.aten.select.int(view_5, 2, -2);  view_5 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:779 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key[:, :, -1],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         select_23: "bf16[1, 12, 192, 64]" = torch.ops.aten.select.int(view_10, 2, -1);  view_10 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:773 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_key_mat = torch.cat(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         cat_8: "bf16[1, 12, 448, 64]" = torch.ops.aten.cat.default([select_3, select_20, select_21, select_6, select_23], 2);  select_3 = select_20 = select_21 = select_6 = select_23 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:786 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -3],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         select_25: "bf16[1, 12, 64, 64]" = torch.ops.aten.select.int(view_6, 2, -3)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:787 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -2],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         select_26: "bf16[1, 12, 64, 64]" = torch.ops.aten.select.int(view_6, 2, -2);  view_6 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:789 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value[:, :, -1],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         select_28: "bf16[1, 12, 192, 64]" = torch.ops.aten.select.int(view_14, 2, -1);  view_14 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:783 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_value_mat = torch.cat(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         cat_9: "bf16[1, 12, 448, 64]" = torch.ops.aten.cat.default([select_8, select_25, select_26, select_11, select_28], 2);  select_8 = select_25 = select_26 = select_11 = select_28 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:795 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -2], second_last_key_mat, ndim=4)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         select_29: "bf16[1, 12, 64, 64]" = torch.ops.aten.select.int(view_4, 2, -2)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_61: "bf16[12, 64, 64]" = torch.ops.aten.reshape.default(select_29, [12, 64, 64]);  select_29 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_62: "bf16[12, 448, 64]" = torch.ops.aten.reshape.default(cat_8, [-1, 448, 64]);  cat_8 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         permute_26: "bf16[12, 64, 448]" = torch.ops.aten.permute.default(view_62, [0, 2, 1]);  view_62 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         bmm_12: "bf16[12, 64, 448]" = torch.ops.aten.bmm.default(view_61, permute_26);  view_61 = permute_26 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_63: "bf16[1, 12, 64, 448]" = torch.ops.aten.reshape.default(bmm_12, [1, 12, 64, 448]);  bmm_12 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:799 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, -3 * to_block_size :],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         slice_132: "f32[1, 1, 1, 192]" = torch.ops.aten.slice.Tensor(arg16_1, 3, -192, 9223372036854775807);  arg16_1 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:796 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_seq_pad = torch.cat(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         cat_10: "f32[1, 1, 1, 448]" = torch.ops.aten.cat.default([slice_68, slice_132, full_default], 3);  slice_68 = slice_132 = full_default = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:807 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask[:, :, -1],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         select_30: "f32[1, 12, 64, 192]" = torch.ops.aten.select.int(mul, 2, -1);  mul = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:804 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_rand_pad = torch.cat(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         cat_11: "f32[1, 12, 64, 448]" = torch.ops.aten.cat.default([full_default_1, select_30], 3);  full_default_1 = select_30 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:811 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product = second_last_product * rsqrt_d
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         mul_15: "bf16[1, 12, 64, 448]" = torch.ops.aten.mul.Tensor(view_63, 0.125);  view_63 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:812 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product += (1.0 - torch.minimum(second_last_seq_pad, second_last_rand_pad)) * attn_mask_penalty
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         minimum_1: "f32[1, 12, 64, 448]" = torch.ops.aten.minimum.default(cat_10, cat_11);  cat_10 = cat_11 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         sub_9: "f32[1, 12, 64, 448]" = torch.ops.aten.sub.Tensor(1.0, minimum_1);  minimum_1 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         mul_16: "f32[1, 12, 64, 448]" = torch.ops.aten.mul.Tensor(sub_9, -10000.0);  sub_9 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         add_11: "f32[1, 12, 64, 448]" = torch.ops.aten.add.Tensor(mul_15, mul_16);  mul_15 = mul_16 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:813 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_attn_weights = nn.functional.softmax(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         amax_3: "f32[1, 12, 64, 1]" = torch.ops.aten.amax.default(add_11, [-1], True)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         sub_10: "f32[1, 12, 64, 448]" = torch.ops.aten.sub.Tensor(add_11, amax_3);  add_11 = amax_3 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         exp_3: "f32[1, 12, 64, 448]" = torch.ops.aten.exp.default(sub_10);  sub_10 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         sum_4: "f32[1, 12, 64, 1]" = torch.ops.aten.sum.dim_IntList(exp_3, [-1], True)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         div_5: "f32[1, 12, 64, 448]" = torch.ops.aten.div.Tensor(exp_3, sum_4);  exp_3 = sum_4 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         convert_element_type_41: "bf16[1, 12, 64, 448]" = torch.ops.prims.convert_element_type.default(div_5, torch.bfloat16);  div_5 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_64: "bf16[12, 64, 448]" = torch.ops.aten.reshape.default(convert_element_type_41, [-1, 64, 448]);  convert_element_type_41 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_65: "bf16[12, 448, 64]" = torch.ops.aten.reshape.default(cat_9, [-1, 448, 64]);  cat_9 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         bmm_13: "bf16[12, 64, 64]" = torch.ops.aten.bmm.default(view_64, view_65);  view_64 = view_65 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_66: "bf16[1, 12, 64, 64]" = torch.ops.aten.reshape.default(bmm_13, [1, 12, 64, 64]);  bmm_13 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:819 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_context_layer.unsqueeze_(2)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         unsqueeze_20: "bf16[1, 12, 1, 64, 64]" = torch.ops.aten.unsqueeze.default(view_66, 2);  view_66 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:826 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -1], key_layer, ndim=4)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         select_31: "bf16[1, 12, 64, 64]" = torch.ops.aten.select.int(view_4, 2, -1);  view_4 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_67: "bf16[12, 64, 64]" = torch.ops.aten.reshape.default(select_31, [12, 64, 64]);  select_31 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         bmm_14: "bf16[12, 64, 832]" = torch.ops.aten.bmm.default(view_67, permute_2);  view_67 = permute_2 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_69: "bf16[1, 12, 64, 832]" = torch.ops.aten.reshape.default(bmm_14, [1, 12, 64, 832]);  bmm_14 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:827 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product = last_product * rsqrt_d
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         mul_17: "bf16[1, 12, 64, 832]" = torch.ops.aten.mul.Tensor(view_69, 0.125);  view_69 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:828 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product += (1.0 - to_mask) * attn_mask_penalty
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         add_12: "f32[1, 12, 64, 832]" = torch.ops.aten.add.Tensor(mul_17, mul_4);  mul_17 = mul_4 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:829 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_attn_weights = nn.functional.softmax(last_product, dim=-1)  # [bsz, n_heads, from_block_size, n]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         amax_4: "f32[1, 12, 64, 1]" = torch.ops.aten.amax.default(add_12, [-1], True)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         sub_12: "f32[1, 12, 64, 832]" = torch.ops.aten.sub.Tensor(add_12, amax_4);  add_12 = amax_4 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         exp_4: "f32[1, 12, 64, 832]" = torch.ops.aten.exp.default(sub_12);  sub_12 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         sum_5: "f32[1, 12, 64, 1]" = torch.ops.aten.sum.dim_IntList(exp_4, [-1], True)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         div_6: "f32[1, 12, 64, 832]" = torch.ops.aten.div.Tensor(exp_4, sum_5);  exp_4 = sum_5 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         convert_element_type_48: "bf16[1, 12, 64, 832]" = torch.ops.prims.convert_element_type.default(div_6, torch.bfloat16);  div_6 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_70: "bf16[12, 64, 832]" = torch.ops.aten.reshape.default(convert_element_type_48, [-1, 64, 832]);  convert_element_type_48 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         bmm_15: "bf16[12, 64, 64]" = torch.ops.aten.bmm.default(view_70, view_19);  view_70 = view_19 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_72: "bf16[1, 12, 64, 64]" = torch.ops.aten.reshape.default(bmm_15, [1, 12, 64, 64]);  bmm_15 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:833 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_context_layer.unsqueeze_(2)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         unsqueeze_21: "bf16[1, 12, 1, 64, 64]" = torch.ops.aten.unsqueeze.default(view_72, 2);  view_72 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:836 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = torch.cat(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         cat_12: "bf16[1, 12, 13, 64, 64]" = torch.ops.aten.cat.default([unsqueeze_4, unsqueeze_5, view_60, unsqueeze_20, unsqueeze_21], 2);  unsqueeze_4 = unsqueeze_5 = view_60 = unsqueeze_20 = unsqueeze_21 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:840 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = context_layer.view((bsz, n_heads, from_seq_len, -1)) * from_mask
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         view_73: "bf16[1, 12, 832, 64]" = torch.ops.aten.reshape.default(cat_12, [1, 12, 832, -1]);  cat_12 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         mul_19: "f32[1, 12, 832, 64]" = torch.ops.aten.mul.Tensor(view_73, arg18_1);  view_73 = arg18_1 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:841 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = torch.transpose(context_layer, 1, 2)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         permute_28: "f32[1, 832, 12, 64]" = torch.ops.aten.permute.default(mul_19, [0, 2, 1, 3]);  mul_19 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]         return (permute_28, unsqueeze)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
I0614 00:48:38.377000 140616046391680 torch/_dynamo/logging.py:56] [15/0] Step 3: torchinductor compiling FORWARDS graph 8
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] TRACED GRAPH
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]  ===== AFTER POST GRAD =====
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]  /localdisk/leslie/torch_inductor_community/pytorch/torch/fx/_lazy_graph_module.py class <lambda>(torch.nn.Module):
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]     def forward(self, arg0_1: "i32[11, 3][3, 1]cpu", arg1_1: "i32[11, 3][3, 1]cpu", arg2_1: "i32[11, 3][3, 1]cpu", arg3_1: "i32[11, 3][3, 1]cpu", arg4_1: "i32[11, 3][3, 1]cpu", arg5_1: "i32[11, 3][3, 1]cpu", arg6_1: "i32[11, 3][3, 1]cpu", arg7_1: "i32[11, 3][3, 1]cpu", arg8_1: "i32[11, 3][3, 1]cpu", arg9_1: "i32[11, 3][3, 1]cpu", arg10_1: "i32[11, 3][3, 1]cpu", arg11_1: "i32[11, 3][3, 1]cpu", arg12_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", arg13_1: "f32[1, 13, 64][832, 64, 1]cpu", arg14_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", arg15_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", arg16_1: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu", arg17_1: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu", arg18_1: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu"):
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:602 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_query_matrix = query_layer.view(bsz, n_heads, from_seq_len // from_block_size, from_block_size, -1)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_4: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.reshape.default(arg12_1, [1, 12, 13, 64, -1]);  arg12_1 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:621 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 0], key_layer, ndim=4)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         select_2: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_4, 2, 0)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_15: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(select_2, [12, 64, 64]);  select_2 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_16: "bf16[12, 832, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(arg14_1, [12, 832, 64])
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         permute_2: "bf16[12, 64, 832][64, 1, 768]cpu" = torch.ops.aten.permute.default(view_16, [0, 2, 1]);  view_16 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         bmm: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.ops.aten.bmm.default(view_15, permute_2);  view_15 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_17: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.reshape.default(bmm, [1, 12, 64, 832]);  bmm = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:623 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product = first_product * rsqrt_d
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         mul_3: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.mul.Tensor(view_17, 0.125);  view_17 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:624 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product += (1.0 - to_mask) * attn_mask_penalty
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         sub: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, arg16_1)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         mul_4: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.mul.Tensor(sub, -10000.0);  sub = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         add_2: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.add.Tensor(mul_3, mul_4);  mul_3 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:625 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_attn_weights = nn.functional.softmax(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         amax: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.amax.default(add_2, [-1], True)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         sub_1: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.sub.Tensor(add_2, amax);  add_2 = amax = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         exp: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.exp.default(sub_1);  sub_1 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         sum_1: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp, [-1], True)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         div_2: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.div.Tensor(exp, sum_1);  exp = sum_1 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         convert_element_type_5: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.prims.convert_element_type.default(div_2, torch.bfloat16);  div_2 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_18: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.ops.aten.reshape.default(convert_element_type_5, [-1, 64, 832]);  convert_element_type_5 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_19: "bf16[12, 832, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(arg15_1, [12, 832, 64])
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         bmm_1: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_18, view_19);  view_18 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_20: "bf16[1, 12, 64, 64][49152, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_1, [1, 12, 64, 64]);  bmm_1 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:631 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_context_layer.unsqueeze_(2)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         unsqueeze_4: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(view_20, 2);  view_20 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:661 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 1], second_key_mat, ndim=4)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         select_13: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_4, 2, 1)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_21: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(select_13, [12, 64, 64]);  select_13 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:603 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix = key_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_5: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.reshape.default(arg14_1, [1, 12, 13, 64, -1]);  arg14_1 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:641 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 0],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         select_3: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_5, 2, 0)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:642 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 1],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         select_4: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_5, 2, 1)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:643 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 2],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         select_5: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_5, 2, 2)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:644 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -1],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         select_6: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_5, 2, -1)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:976 in torch_gather_b2, code: flattened_params = params.reshape(-1, params.shape[-2], params.shape[-1])
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         clone_2: "bf16[1, 12, 13, 64, 64][638976, 53248, 4096, 64, 1]cpu" = torch.ops.aten.clone.default(view_5, memory_format = torch.contiguous_format)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_8: "bf16[156, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.reshape.default(clone_2, [156, 64, 64]);  clone_2 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:593 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = np.stack(rand_attn, axis=0)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         cat: "i32[132, 3][3, 1]cpu" = torch.ops.aten.cat.default([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1]);  arg0_1 = arg1_1 = arg2_1 = arg3_1 = arg4_1 = arg5_1 = arg6_1 = arg7_1 = arg8_1 = arg9_1 = arg10_1 = arg11_1 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view: "i32[12, 11, 3][33, 3, 1]cpu" = torch.ops.aten.reshape.default(cat, [12, 11, 3]);  cat = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:594 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = torch.tensor(rand_attn, device=query_layer.device, dtype=torch.long)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         convert_element_type: "i64[12, 11, 3][33, 3, 1]cpu" = torch.ops.prims.convert_element_type.default(view, torch.int64);  view = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:595 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn.unsqueeze_(0)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         unsqueeze: "i64[1, 12, 11, 3][396, 33, 3, 1]cpu" = torch.ops.aten.unsqueeze.default(convert_element_type, 0);  convert_element_type = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:975 in torch_gather_b2, code: flattened_indices = indices.view(-1) + indices_shift
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_7: "i64[396][1]cpu" = torch.ops.aten.reshape.default(unsqueeze, [-1])
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:972 in torch_gather_b2, code: shift = torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         iota: "i64[396][1]cpu" = torch.ops.prims.iota.default(396, start = 0, step = 1, dtype = torch.int64, device = device(type='cpu'), requires_grad = False)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:973 in torch_gather_b2, code: indices_shift = torch.div(shift, num_indices_to_gather, rounding_mode="floor") * num_indices_to_pick_from
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         div: "i64[396][1]cpu" = torch.ops.aten.div.Tensor_mode(iota, 33, rounding_mode = 'floor');  iota = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         mul_1: "i64[396][1]cpu" = torch.ops.aten.mul.Tensor(div, 13);  div = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:975 in torch_gather_b2, code: flattened_indices = indices.view(-1) + indices_shift
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         add: "i64[396][1]cpu" = torch.ops.aten.add.Tensor(view_7, mul_1);  view_7 = mul_1 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:978 in torch_gather_b2, code: out_flattened = flattened_params.index_select(0, flattened_indices)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         index_1: "bf16[396, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.index.Tensor(view_8, [add]);  view_8 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:980 in torch_gather_b2, code: out = out_flattened.reshape(params.shape[:2] + (num_indices_to_gather,) + params.shape[3:])
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_9: "bf16[1, 12, 33, 64, 64][1622016, 135168, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(index_1, [1, 12, 33, 64, 64]);  index_1 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:608 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key = gathered_key.view(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_10: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.reshape.default(view_9, [1, 12, 11, 192, -1]);  view_9 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:645 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key[:, :, 0],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         select_7: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = torch.ops.aten.select.int(view_10, 2, 0)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:639 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_key_mat = torch.cat(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         cat_1: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.ops.aten.cat.default([select_3, select_4, select_5, select_6, select_7], 2);  select_4 = select_5 = select_7 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_22: "bf16[12, 448, 64][28672, 64, 1]cpu" = torch.ops.aten.reshape.default(cat_1, [-1, 448, 64]);  cat_1 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         permute_3: "bf16[12, 64, 448][28672, 1, 64]cpu" = torch.ops.aten.permute.default(view_22, [0, 2, 1]);  view_22 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         bmm_2: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.ops.aten.bmm.default(view_21, permute_3);  view_21 = permute_3 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_23: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.reshape.default(bmm_2, [1, 12, 64, 448]);  bmm_2 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:677 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product = second_product * rsqrt_d
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         mul_5: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.mul.Tensor(view_23, 0.125);  view_23 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:664 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, : 3 * to_block_size],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         slice_30: "f32[1, 1, 1, 192][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 3, 0, 192)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:665 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, -to_block_size:],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         slice_34: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 3, -64, 9223372036854775807)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:666 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask.new_ones([bsz, 1, 1, n_rand_blocks * to_block_size]),
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         full_default: "f32[1, 1, 1, 192][192, 192, 192, 1]cpu" = torch.ops.aten.full.default([1, 1, 1, 192], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:662 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_seq_pad = torch.cat(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         cat_3: "f32[1, 1, 1, 448][448, 448, 448, 1]cpu" = torch.ops.aten.cat.default([slice_30, slice_34, full_default], 3);  slice_30 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:672 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask.new_ones([bsz, n_heads, from_block_size, 4 * to_block_size]),
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         full_default_1: "f32[1, 12, 64, 256][196608, 16384, 256, 1]cpu" = torch.ops.aten.full.default([1, 12, 64, 256], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1017 in _create_rand_mask_from_inputs, code: rand_mask = torch.einsum("blq,bhlk->bhlqk", from_blocked_mask[:, 1:-1], rand_mask)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         slice_2: "f32[1, 11, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(arg13_1, 1, 1, -1)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         unsqueeze_1: "f32[1, 11, 64, 1][832, 64, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_2, 3);  slice_2 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         unsqueeze_2: "f32[1, 11, 64, 1, 1][832, 64, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_1, 4);  unsqueeze_1 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         permute: "f32[1, 1, 11, 64, 1][832, 1, 64, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_2, [0, 3, 1, 2, 4]);  unsqueeze_2 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in _create_rand_mask_from_inputs, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         select: "f32[13, 64][64, 1]cpu" = torch.ops.aten.select.int(arg13_1, 0, 0);  arg13_1 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         select_1: "i64[12, 11, 3][33, 3, 1]cpu" = torch.ops.aten.select.int(unsqueeze, 0, 0)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in <listcomp>, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_1: "i64[396][1]cpu" = torch.ops.aten.reshape.default(select_1, [396]);  select_1 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         index: "f32[396, 64][64, 1]cpu" = torch.ops.aten.index.Tensor(select, [view_1]);  select = view_1 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in _create_rand_mask_from_inputs, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_2: "f32[1, 396, 64][25344, 64, 1]cpu" = torch.ops.aten.reshape.default(index, [1, 396, 64]);  index = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1016 in _create_rand_mask_from_inputs, code: rand_mask = rand_mask.view(batch_size, num_attention_heads, num_windows, num_rand_blocks * from_block_size)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_3: "f32[1, 12, 11, 192][25344, 2112, 192, 1]cpu" = torch.ops.aten.reshape.default(view_2, [1, 12, 11, 192]);  view_2 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1017 in _create_rand_mask_from_inputs, code: rand_mask = torch.einsum("blq,bhlk->bhlqk", from_blocked_mask[:, 1:-1], rand_mask)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         unsqueeze_3: "f32[1, 12, 11, 192, 1][25344, 2112, 192, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(view_3, 4);  view_3 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         permute_1: "f32[1, 12, 11, 1, 192][25344, 2112, 192, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_3, [0, 1, 2, 4, 3]);  unsqueeze_3 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         mul: "f32[1, 12, 11, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(permute, permute_1);  permute = permute_1 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:673 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask[:, :, 0],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         select_14: "f32[1, 12, 64, 192][1622016, 135168, 192, 1]cpu" = torch.ops.aten.select.int(mul, 2, 0)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:670 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_rand_pad = torch.cat(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         cat_4: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.cat.default([full_default_1, select_14], 3);  select_14 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:678 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product += (1.0 - torch.minimum(second_seq_pad, second_rand_pad)) * attn_mask_penalty
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         minimum: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.minimum.default(cat_3, cat_4);  cat_3 = cat_4 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         sub_2: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, minimum);  minimum = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         mul_6: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.mul.Tensor(sub_2, -10000.0);  sub_2 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         add_3: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.add.Tensor(mul_5, mul_6);  mul_5 = mul_6 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:679 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_attn_weights = nn.functional.softmax(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         amax_1: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.amax.default(add_3, [-1], True)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         sub_3: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.sub.Tensor(add_3, amax_1);  add_3 = amax_1 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         exp_1: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.exp.default(sub_3);  sub_3 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         sum_2: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp_1, [-1], True)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         div_3: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.div.Tensor(exp_1, sum_2);  exp_1 = sum_2 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         convert_element_type_12: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.prims.convert_element_type.default(div_3, torch.bfloat16);  div_3 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_24: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.ops.aten.reshape.default(convert_element_type_12, [-1, 64, 448]);  convert_element_type_12 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:604 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix = value_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_6: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.reshape.default(arg15_1, [1, 12, 13, 64, -1]);  arg15_1 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:651 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 0],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         select_8: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_6, 2, 0)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:652 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 1],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         select_9: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_6, 2, 1)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:653 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 2],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         select_10: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_6, 2, 2)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:654 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -1],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         select_11: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_6, 2, -1)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:976 in torch_gather_b2, code: flattened_params = params.reshape(-1, params.shape[-2], params.shape[-1])
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         clone_3: "bf16[1, 12, 13, 64, 64][638976, 53248, 4096, 64, 1]cpu" = torch.ops.aten.clone.default(view_6, memory_format = torch.contiguous_format)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_12: "bf16[156, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.reshape.default(clone_3, [156, 64, 64]);  clone_3 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:978 in torch_gather_b2, code: out_flattened = flattened_params.index_select(0, flattened_indices)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         index_2: "bf16[396, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.index.Tensor(view_12, [add]);  view_12 = add = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:980 in torch_gather_b2, code: out = out_flattened.reshape(params.shape[:2] + (num_indices_to_gather,) + params.shape[3:])
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_13: "bf16[1, 12, 33, 64, 64][1622016, 135168, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(index_2, [1, 12, 33, 64, 64]);  index_2 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:612 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value = gathered_value.view(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_14: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.reshape.default(view_13, [1, 12, 11, 192, -1]);  view_13 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:655 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value[:, :, 0],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         select_12: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = torch.ops.aten.select.int(view_14, 2, 0)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:649 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_value_mat = torch.cat(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         cat_2: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.ops.aten.cat.default([select_8, select_9, select_10, select_11, select_12], 2);  select_9 = select_10 = select_12 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_25: "bf16[12, 448, 64][28672, 64, 1]cpu" = torch.ops.aten.reshape.default(cat_2, [-1, 448, 64]);  cat_2 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         bmm_3: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_24, view_25);  view_24 = view_25 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_26: "bf16[1, 12, 64, 64][49152, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_3, [1, 12, 64, 64]);  bmm_3 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:686 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_context_layer.unsqueeze_(2)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         unsqueeze_5: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(view_26, 2);  view_26 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:702 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: middle_query_matrix = blocked_query_matrix[:, :, 2:-2]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         slice_57: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_4, 2, 2, -2)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:717 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product = torch.einsum(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         unsqueeze_6: "bf16[1, 12, 9, 64, 64, 1][638976, 64, 49152, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_57, 5)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         permute_6: "bf16[1, 12, 9, 64, 1, 64][638976, 64, 49152, 768, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_6, [0, 1, 2, 3, 5, 4]);  unsqueeze_6 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         permute_8: "bf16[12, 9, 64, 64, 1, 1][64, 49152, 768, 1, 638976, 1]cpu" = torch.ops.aten.permute.default(permute_6, [1, 2, 3, 5, 0, 4]);  permute_6 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_33: "bf16[12, 576, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(permute_8, [12, 576, 64]);  permute_8 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         unsqueeze_7: "bf16[1, 12, 64, 64, 1][638976, 64, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(select_3, 4)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         unsqueeze_8: "bf16[1, 12, 64, 64, 1, 1][638976, 64, 768, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_7, 5);  unsqueeze_7 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         permute_7: "bf16[1, 12, 1, 1, 64, 64][638976, 64, 1, 1, 768, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_8, [0, 1, 4, 5, 2, 3]);  unsqueeze_8 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         permute_9: "bf16[12, 64, 1, 64, 1, 1][64, 1, 638976, 768, 1, 1]cpu" = torch.ops.aten.permute.default(permute_7, [1, 5, 0, 4, 2, 3]);  permute_7 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_34: "bf16[12, 64, 64][64, 1, 768]cpu" = torch.ops.aten.reshape.default(permute_9, [12, 64, 64]);  permute_9 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         bmm_6: "bf16[12, 576, 64][36864, 64, 1]cpu" = torch.ops.aten.bmm.default(view_33, view_34);  view_34 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_35: "bf16[12, 9, 64, 1, 1, 64][36864, 4096, 64, 64, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_6, [12, 9, 64, 1, 1, 64]);  bmm_6 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         permute_10: "bf16[1, 12, 9, 64, 64, 1][64, 36864, 4096, 64, 1, 64]cpu" = torch.ops.aten.permute.default(view_35, [4, 0, 1, 2, 5, 3]);  view_35 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_36: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(permute_10, [1, 12, 9, 64, 64]);  permute_10 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:720 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product = first_band_product * rsqrt_d
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         mul_9: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.mul.Tensor(view_36, 0.125);  view_36 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:730 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product += (1.0 - to_mask[:, :, :, :to_block_size].unsqueeze(3)) * attn_mask_penalty
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         slice_68: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 3, 0, 64)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         unsqueeze_12: "f32[1, 1, 1, 1, 64][832, 832, 832, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_68, 3)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         sub_5: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, unsqueeze_12);  unsqueeze_12 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         mul_12: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = torch.ops.aten.mul.Tensor(sub_5, -10000.0);  sub_5 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         add_5: "f32[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(mul_9, mul_12);  mul_9 = mul_12 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         convert_element_type_24: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.prims.convert_element_type.default(add_5, torch.bfloat16);  add_5 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         clone_4: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.clone.default(slice_57, memory_format = torch.contiguous_format);  slice_57 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_27: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.reshape.default(clone_4, [108, 64, 64]);  clone_4 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:696 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: [blocked_key_matrix[:, :, 1:-3], blocked_key_matrix[:, :, 2:-2], blocked_key_matrix[:, :, 3:-1]], dim=3
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         slice_39: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 2, 1, -3)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         slice_42: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 2, 2, -2)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         slice_45: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 2, 3, -1)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:695 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: exp_blocked_key_matrix = torch.cat(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         cat_5: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" = torch.ops.aten.cat.default([slice_39, slice_42, slice_45], 3);  slice_39 = slice_42 = slice_45 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_28: "bf16[108, 192, 64][12288, 64, 1]cpu" = torch.ops.aten.reshape.default(cat_5, [-1, 192, 64]);  cat_5 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         permute_4: "bf16[108, 64, 192][12288, 1, 64]cpu" = torch.ops.aten.permute.default(view_28, [0, 2, 1]);  view_28 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         bmm_4: "bf16[108, 64, 192][12288, 192, 1]cpu" = torch.ops.aten.bmm.default(view_27, permute_4);  permute_4 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_29: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.reshape.default(bmm_4, [1, 12, 9, 64, 192]);  bmm_4 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:708 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: inner_band_product = inner_band_product * rsqrt_d
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         mul_7: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(view_29, 0.125);  view_29 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:729 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: inner_band_product += (1.0 - band_mask) * attn_mask_penalty
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         sub_4: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, arg17_1);  arg17_1 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         mul_11: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(sub_4, -10000.0);  sub_4 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         add_4: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.add.Tensor(mul_7, mul_11);  mul_7 = mul_11 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         convert_element_type_23: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.prims.convert_element_type.default(add_4, torch.bfloat16);  add_4 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:712 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product = self.torch_bmm_nd_transpose(middle_query_matrix, gathered_key[:, :, 1:-1], ndim=5)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         slice_60: "bf16[1, 12, 9, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view_10, 2, 1, -1)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         clone_6: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" = torch.ops.aten.clone.default(slice_60, memory_format = torch.contiguous_format);  slice_60 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_31: "bf16[108, 192, 64][12288, 64, 1]cpu" = torch.ops.aten.reshape.default(clone_6, [108, 192, 64]);  clone_6 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         permute_5: "bf16[108, 64, 192][12288, 1, 64]cpu" = torch.ops.aten.permute.default(view_31, [0, 2, 1]);  view_31 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         bmm_5: "bf16[108, 64, 192][12288, 192, 1]cpu" = torch.ops.aten.bmm.default(view_27, permute_5);  view_27 = permute_5 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_32: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.reshape.default(bmm_5, [1, 12, 9, 64, 192]);  bmm_5 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:714 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product = rand_band_product * rsqrt_d
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         mul_8: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(view_32, 0.125);  view_32 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:732 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product += (1.0 - rand_mask[:, :, 1:-1]) * attn_mask_penalty
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         slice_75: "f32[1, 12, 9, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.slice.Tensor(mul, 2, 1, -1)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         sub_7: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, slice_75);  slice_75 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         mul_14: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(sub_7, -10000.0);  sub_7 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         add_7: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.add.Tensor(mul_8, mul_14);  mul_8 = mul_14 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         convert_element_type_26: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.prims.convert_element_type.default(add_7, torch.bfloat16);  add_7 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:723 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product = torch.einsum(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         unsqueeze_10: "bf16[1, 12, 64, 64, 1][638976, 64, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(select_6, 4)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         unsqueeze_11: "bf16[1, 12, 64, 64, 1, 1][638976, 64, 768, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_10, 5);  unsqueeze_10 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         permute_12: "bf16[1, 12, 1, 1, 64, 64][638976, 64, 1, 1, 768, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_11, [0, 1, 4, 5, 2, 3]);  unsqueeze_11 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         permute_14: "bf16[12, 64, 1, 64, 1, 1][64, 1, 638976, 768, 1, 1]cpu" = torch.ops.aten.permute.default(permute_12, [1, 5, 0, 4, 2, 3]);  permute_12 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_38: "bf16[12, 64, 64][64, 1, 768]cpu" = torch.ops.aten.reshape.default(permute_14, [12, 64, 64]);  permute_14 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         bmm_7: "bf16[12, 576, 64][36864, 64, 1]cpu" = torch.ops.aten.bmm.default(view_33, view_38);  view_33 = view_38 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_39: "bf16[12, 9, 64, 1, 1, 64][36864, 4096, 64, 64, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_7, [12, 9, 64, 1, 1, 64]);  bmm_7 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         permute_15: "bf16[1, 12, 9, 64, 64, 1][64, 36864, 4096, 64, 1, 64]cpu" = torch.ops.aten.permute.default(view_39, [4, 0, 1, 2, 5, 3]);  view_39 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_40: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(permute_15, [1, 12, 9, 64, 64]);  permute_15 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:726 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product = last_band_product * rsqrt_d
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         mul_10: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.mul.Tensor(view_40, 0.125);  view_40 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:731 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product += (1.0 - to_mask[:, :, :, -to_block_size:].unsqueeze(3)) * attn_mask_penalty
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         unsqueeze_13: "f32[1, 1, 1, 1, 64][832, 832, 832, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_34, 3);  slice_34 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         sub_6: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, unsqueeze_13);  unsqueeze_13 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         mul_13: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = torch.ops.aten.mul.Tensor(sub_6, -10000.0);  sub_6 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         add_6: "f32[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(mul_10, mul_13);  mul_10 = mul_13 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         convert_element_type_25: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.prims.convert_element_type.default(add_6, torch.bfloat16);  add_6 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:735 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: band_product = torch.cat(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         cat_7: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.cat.default([convert_element_type_24, convert_element_type_23, convert_element_type_26, convert_element_type_25], -1);  convert_element_type_24 = convert_element_type_23 = convert_element_type_26 = convert_element_type_25 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:740 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights = nn.functional.softmax(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         convert_element_type_27: "f32[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.prims.convert_element_type.default(cat_7, torch.float32);  cat_7 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         amax_2: "f32[1, 12, 9, 64, 1][6912, 576, 64, 1, 1]cpu" = torch.ops.aten.amax.default(convert_element_type_27, [-1], True)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         sub_8: "f32[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.sub.Tensor(convert_element_type_27, amax_2);  convert_element_type_27 = amax_2 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         exp_2: "f32[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.exp.default(sub_8);  sub_8 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         sum_3: "f32[1, 12, 9, 64, 1][6912, 576, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp_2, [-1], True)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         div_4: "f32[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.div.Tensor(exp_2, sum_3);  exp_2 = sum_3 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         convert_element_type_28: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.prims.convert_element_type.default(div_4, torch.bfloat16);  div_4 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:747 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights[:, :, :, :, to_block_size : 4 * to_block_size], exp_blocked_value_matrix, ndim=5
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         slice_80: "bf16[1, 12, 9, 64, 192][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(convert_element_type_28, 4, 64, 256)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_41: "bf16[108, 64, 192][32768, 512, 1]cpu" = torch.ops.aten.reshape.default(slice_80, [108, 64, 192]);  slice_80 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:699 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: [blocked_value_matrix[:, :, 1:-3], blocked_value_matrix[:, :, 2:-2], blocked_value_matrix[:, :, 3:-1]],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         slice_48: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 2, 1, -3)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         slice_51: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 2, 2, -2)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         slice_54: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 2, 3, -1)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:698 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: exp_blocked_value_matrix = torch.cat(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         cat_6: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" = torch.ops.aten.cat.default([slice_48, slice_51, slice_54], 3);  slice_48 = slice_51 = slice_54 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_42: "bf16[108, 192, 64][12288, 64, 1]cpu" = torch.ops.aten.reshape.default(cat_6, [-1, 192, 64]);  cat_6 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         bmm_8: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_41, view_42);  view_41 = view_42 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_43: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_8, [1, 12, 9, 64, 64]);  bmm_8 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:754 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights[:, :, :, :, 4 * to_block_size : -to_block_size], gathered_value[:, :, 1:-1], ndim=5
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         slice_85: "bf16[1, 12, 9, 64, 192][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(convert_element_type_28, 4, 256, -64)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_44: "bf16[108, 64, 192][32768, 512, 1]cpu" = torch.ops.aten.reshape.default(slice_85, [108, 64, 192]);  slice_85 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:754 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights[:, :, :, :, 4 * to_block_size : -to_block_size], gathered_value[:, :, 1:-1], ndim=5
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         slice_88: "bf16[1, 12, 9, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view_14, 2, 1, -1)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         clone_7: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" = torch.ops.aten.clone.default(slice_88, memory_format = torch.contiguous_format);  slice_88 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_45: "bf16[108, 192, 64][12288, 64, 1]cpu" = torch.ops.aten.reshape.default(clone_7, [108, 192, 64]);  clone_7 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         bmm_9: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_44, view_45);  view_44 = view_45 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_46: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_9, [1, 12, 9, 64, 64]);  bmm_9 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:753 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += self.torch_bmm_nd(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         add_8: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(view_43, view_46);  view_43 = view_46 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_47: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.reshape.default(add_8, [108, 64, 64]);  add_8 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_48: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(view_47, [1, 12, 9, 64, 64]);  view_47 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:760 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, :to_block_size], blocked_value_matrix[:, :, 0]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         slice_93: "bf16[1, 12, 9, 64, 64][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(convert_element_type_28, 4, 0, 64)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:759 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += torch.einsum(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         unsqueeze_14: "bf16[1, 12, 9, 64, 64, 1][3538944, 294912, 32768, 512, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_93, 5);  slice_93 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         permute_16: "bf16[1, 12, 9, 64, 1, 64][3538944, 294912, 32768, 512, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_14, [0, 1, 2, 3, 5, 4]);  unsqueeze_14 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         permute_18: "bf16[12, 9, 64, 64, 1, 1][294912, 32768, 512, 1, 3538944, 1]cpu" = torch.ops.aten.permute.default(permute_16, [1, 2, 3, 5, 0, 4]);  permute_16 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_49: "bf16[12, 576, 64][294912, 512, 1]cpu" = torch.ops.aten.reshape.default(permute_18, [12, 576, 64]);  permute_18 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         unsqueeze_15: "bf16[1, 12, 64, 64, 1][638976, 64, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(select_8, 4)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         unsqueeze_16: "bf16[1, 12, 64, 64, 1, 1][638976, 64, 768, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_15, 5);  unsqueeze_15 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         permute_17: "bf16[1, 12, 1, 1, 64, 64][638976, 64, 1, 1, 1, 768]cpu" = torch.ops.aten.permute.default(unsqueeze_16, [0, 1, 4, 5, 3, 2]);  unsqueeze_16 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         permute_19: "bf16[12, 64, 1, 64, 1, 1][64, 768, 638976, 1, 1, 1]cpu" = torch.ops.aten.permute.default(permute_17, [1, 5, 0, 4, 2, 3]);  permute_17 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_50: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(permute_19, [12, 64, 64]);  permute_19 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         bmm_10: "bf16[12, 576, 64][36864, 64, 1]cpu" = torch.ops.aten.bmm.default(view_49, view_50);  view_49 = view_50 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_51: "bf16[12, 9, 64, 1, 1, 64][36864, 4096, 64, 64, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_10, [12, 9, 64, 1, 1, 64]);  bmm_10 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         permute_20: "bf16[1, 12, 9, 64, 64, 1][64, 36864, 4096, 64, 1, 64]cpu" = torch.ops.aten.permute.default(view_51, [4, 0, 1, 2, 5, 3]);  view_51 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_52: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(permute_20, [1, 12, 9, 64, 64]);  permute_20 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         add_9: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(view_48, view_52);  view_48 = view_52 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_53: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.reshape.default(add_9, [108, 64, 64]);  add_9 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_54: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(view_53, [1, 12, 9, 64, 64]);  view_53 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:763 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, -to_block_size:], blocked_value_matrix[:, :, -1]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         slice_100: "bf16[1, 12, 9, 64, 64][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(convert_element_type_28, 4, -64, 9223372036854775807);  convert_element_type_28 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:762 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += torch.einsum(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         unsqueeze_17: "bf16[1, 12, 9, 64, 64, 1][3538944, 294912, 32768, 512, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_100, 5);  slice_100 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         permute_21: "bf16[1, 12, 9, 64, 1, 64][3538944, 294912, 32768, 512, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_17, [0, 1, 2, 3, 5, 4]);  unsqueeze_17 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         permute_23: "bf16[12, 9, 64, 64, 1, 1][294912, 32768, 512, 1, 3538944, 1]cpu" = torch.ops.aten.permute.default(permute_21, [1, 2, 3, 5, 0, 4]);  permute_21 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_55: "bf16[12, 576, 64][294912, 512, 1]cpu" = torch.ops.aten.reshape.default(permute_23, [12, 576, 64]);  permute_23 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         unsqueeze_18: "bf16[1, 12, 64, 64, 1][638976, 64, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(select_11, 4)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         unsqueeze_19: "bf16[1, 12, 64, 64, 1, 1][638976, 64, 768, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_18, 5);  unsqueeze_18 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         permute_22: "bf16[1, 12, 1, 1, 64, 64][638976, 64, 1, 1, 1, 768]cpu" = torch.ops.aten.permute.default(unsqueeze_19, [0, 1, 4, 5, 3, 2]);  unsqueeze_19 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         permute_24: "bf16[12, 64, 1, 64, 1, 1][64, 768, 638976, 1, 1, 1]cpu" = torch.ops.aten.permute.default(permute_22, [1, 5, 0, 4, 2, 3]);  permute_22 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_56: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(permute_24, [12, 64, 64]);  permute_24 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         bmm_11: "bf16[12, 576, 64][36864, 64, 1]cpu" = torch.ops.aten.bmm.default(view_55, view_56);  view_55 = view_56 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_57: "bf16[12, 9, 64, 1, 1, 64][36864, 4096, 64, 64, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_11, [12, 9, 64, 1, 1, 64]);  bmm_11 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         permute_25: "bf16[1, 12, 9, 64, 64, 1][64, 36864, 4096, 64, 1, 64]cpu" = torch.ops.aten.permute.default(view_57, [4, 0, 1, 2, 5, 3]);  view_57 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_58: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(permute_25, [1, 12, 9, 64, 64]);  permute_25 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         add_10: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(view_54, view_58);  view_54 = view_58 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_59: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.reshape.default(add_10, [108, 64, 64]);  add_10 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_60: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(view_59, [1, 12, 9, 64, 64]);  view_59 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:795 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -2], second_last_key_mat, ndim=4)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         select_29: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_4, 2, -2)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_61: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(select_29, [12, 64, 64]);  select_29 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:776 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -3],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         select_20: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_5, 2, -3)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:777 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -2],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         select_21: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_5, 2, -2);  view_5 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:779 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key[:, :, -1],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         select_23: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = torch.ops.aten.select.int(view_10, 2, -1);  view_10 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:773 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_key_mat = torch.cat(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         cat_8: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.ops.aten.cat.default([select_3, select_20, select_21, select_6, select_23], 2);  select_3 = select_20 = select_21 = select_6 = select_23 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_62: "bf16[12, 448, 64][28672, 64, 1]cpu" = torch.ops.aten.reshape.default(cat_8, [-1, 448, 64]);  cat_8 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         permute_26: "bf16[12, 64, 448][28672, 1, 64]cpu" = torch.ops.aten.permute.default(view_62, [0, 2, 1]);  view_62 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         bmm_12: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.ops.aten.bmm.default(view_61, permute_26);  view_61 = permute_26 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_63: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.reshape.default(bmm_12, [1, 12, 64, 448]);  bmm_12 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:811 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product = second_last_product * rsqrt_d
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         mul_15: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.mul.Tensor(view_63, 0.125);  view_63 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:799 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, -3 * to_block_size :],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         slice_132: "f32[1, 1, 1, 192][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 3, -192, 9223372036854775807);  arg16_1 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:796 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_seq_pad = torch.cat(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         cat_10: "f32[1, 1, 1, 448][448, 448, 448, 1]cpu" = torch.ops.aten.cat.default([slice_68, slice_132, full_default], 3);  slice_68 = slice_132 = full_default = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:807 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask[:, :, -1],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         select_30: "f32[1, 12, 64, 192][1622016, 135168, 192, 1]cpu" = torch.ops.aten.select.int(mul, 2, -1);  mul = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:804 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_rand_pad = torch.cat(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         cat_11: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.cat.default([full_default_1, select_30], 3);  full_default_1 = select_30 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:812 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product += (1.0 - torch.minimum(second_last_seq_pad, second_last_rand_pad)) * attn_mask_penalty
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         minimum_1: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.minimum.default(cat_10, cat_11);  cat_10 = cat_11 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         sub_9: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, minimum_1);  minimum_1 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         mul_16: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.mul.Tensor(sub_9, -10000.0);  sub_9 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         add_11: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.add.Tensor(mul_15, mul_16);  mul_15 = mul_16 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:813 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_attn_weights = nn.functional.softmax(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         amax_3: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.amax.default(add_11, [-1], True)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         sub_10: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.sub.Tensor(add_11, amax_3);  add_11 = amax_3 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         exp_3: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.exp.default(sub_10);  sub_10 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         sum_4: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp_3, [-1], True)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         div_5: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.div.Tensor(exp_3, sum_4);  exp_3 = sum_4 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         convert_element_type_41: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.prims.convert_element_type.default(div_5, torch.bfloat16);  div_5 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_64: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.ops.aten.reshape.default(convert_element_type_41, [-1, 64, 448]);  convert_element_type_41 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:786 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -3],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         select_25: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_6, 2, -3)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:787 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -2],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         select_26: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_6, 2, -2);  view_6 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:789 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value[:, :, -1],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         select_28: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = torch.ops.aten.select.int(view_14, 2, -1);  view_14 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:783 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_value_mat = torch.cat(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         cat_9: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.ops.aten.cat.default([select_8, select_25, select_26, select_11, select_28], 2);  select_8 = select_25 = select_26 = select_11 = select_28 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_65: "bf16[12, 448, 64][28672, 64, 1]cpu" = torch.ops.aten.reshape.default(cat_9, [-1, 448, 64]);  cat_9 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         bmm_13: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_64, view_65);  view_64 = view_65 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_66: "bf16[1, 12, 64, 64][49152, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_13, [1, 12, 64, 64]);  bmm_13 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:819 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_context_layer.unsqueeze_(2)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         unsqueeze_20: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(view_66, 2);  view_66 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:826 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -1], key_layer, ndim=4)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         select_31: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_4, 2, -1);  view_4 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_67: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(select_31, [12, 64, 64]);  select_31 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         bmm_14: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.ops.aten.bmm.default(view_67, permute_2);  view_67 = permute_2 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_69: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.reshape.default(bmm_14, [1, 12, 64, 832]);  bmm_14 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:827 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product = last_product * rsqrt_d
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         mul_17: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.mul.Tensor(view_69, 0.125);  view_69 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:828 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product += (1.0 - to_mask) * attn_mask_penalty
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         add_12: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.add.Tensor(mul_17, mul_4);  mul_17 = mul_4 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:829 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_attn_weights = nn.functional.softmax(last_product, dim=-1)  # [bsz, n_heads, from_block_size, n]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         amax_4: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.amax.default(add_12, [-1], True)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         sub_12: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.sub.Tensor(add_12, amax_4);  add_12 = amax_4 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         exp_4: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.exp.default(sub_12);  sub_12 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         sum_5: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp_4, [-1], True)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         div_6: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.div.Tensor(exp_4, sum_5);  exp_4 = sum_5 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         convert_element_type_48: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.prims.convert_element_type.default(div_6, torch.bfloat16);  div_6 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_70: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.ops.aten.reshape.default(convert_element_type_48, [-1, 64, 832]);  convert_element_type_48 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         bmm_15: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_70, view_19);  view_70 = view_19 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_72: "bf16[1, 12, 64, 64][49152, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_15, [1, 12, 64, 64]);  bmm_15 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:833 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_context_layer.unsqueeze_(2)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         unsqueeze_21: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(view_72, 2);  view_72 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:836 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = torch.cat(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         cat_12: "bf16[1, 12, 13, 64, 64][638976, 53248, 4096, 64, 1]cpu" = torch.ops.aten.cat.default([unsqueeze_4, unsqueeze_5, view_60, unsqueeze_20, unsqueeze_21], 2);  unsqueeze_4 = unsqueeze_5 = view_60 = unsqueeze_20 = unsqueeze_21 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:840 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = context_layer.view((bsz, n_heads, from_seq_len, -1)) * from_mask
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         view_73: "bf16[1, 12, 832, 64][638976, 53248, 64, 1]cpu" = torch.ops.aten.reshape.default(cat_12, [1, 12, 832, -1]);  cat_12 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         mul_19: "f32[1, 12, 832, 64][638976, 53248, 64, 1]cpu" = torch.ops.aten.mul.Tensor(view_73, arg18_1);  view_73 = arg18_1 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:841 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = torch.transpose(context_layer, 1, 2)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         permute_28: "f32[1, 832, 12, 64][638976, 64, 53248, 1]cpu" = torch.ops.aten.permute.default(mul_19, [0, 2, 1, 3]);  mul_19 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]         return (permute_28, unsqueeze)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.471000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %arg0_1 : [num_users=1] = placeholder[target=arg0_1]
V0614 00:48:38.471000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %arg1_1 : [num_users=1] = placeholder[target=arg1_1]
V0614 00:48:38.471000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %arg2_1 : [num_users=1] = placeholder[target=arg2_1]
V0614 00:48:38.472000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %arg3_1 : [num_users=1] = placeholder[target=arg3_1]
V0614 00:48:38.472000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %arg4_1 : [num_users=1] = placeholder[target=arg4_1]
V0614 00:48:38.472000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %arg5_1 : [num_users=1] = placeholder[target=arg5_1]
V0614 00:48:38.473000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %arg6_1 : [num_users=1] = placeholder[target=arg6_1]
V0614 00:48:38.473000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %arg7_1 : [num_users=1] = placeholder[target=arg7_1]
V0614 00:48:38.473000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %arg8_1 : [num_users=1] = placeholder[target=arg8_1]
V0614 00:48:38.473000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %arg9_1 : [num_users=1] = placeholder[target=arg9_1]
V0614 00:48:38.474000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %arg10_1 : [num_users=1] = placeholder[target=arg10_1]
V0614 00:48:38.474000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %arg11_1 : [num_users=1] = placeholder[target=arg11_1]
V0614 00:48:38.474000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %arg12_1 : [num_users=1] = placeholder[target=arg12_1]
V0614 00:48:38.474000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %arg13_1 : [num_users=2] = placeholder[target=arg13_1]
V0614 00:48:38.475000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %arg14_1 : [num_users=2] = placeholder[target=arg14_1]
V0614 00:48:38.475000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %arg15_1 : [num_users=2] = placeholder[target=arg15_1]
V0614 00:48:38.475000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %arg16_1 : [num_users=5] = placeholder[target=arg16_1]
V0614 00:48:38.476000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %arg17_1 : [num_users=1] = placeholder[target=arg17_1]
V0614 00:48:38.476000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %arg18_1 : [num_users=1] = placeholder[target=arg18_1]
V0614 00:48:38.476000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_4 : [num_users=5] = call_function[target=torch.ops.aten.reshape.default](args = (%arg12_1, [1, 12, 13, 64, -1]), kwargs = {})
V0614 00:48:38.476000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.477000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_2 : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%view_4, 2, 0), kwargs = {})
V0614 00:48:38.478000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function select at 0x7fe2179c4550>
V0614 00:48:38.479000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_15 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%select_2, [12, 64, 64]), kwargs = {})
V0614 00:48:38.479000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.479000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_16 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%arg14_1, [12, 832, 64]), kwargs = {})
V0614 00:48:38.479000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.480000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_2 : [num_users=2] = call_function[target=torch.ops.aten.permute.default](args = (%view_16, [0, 2, 1]), kwargs = {})
V0614 00:48:38.480000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function permute at 0x7fe2179c3370>
V0614 00:48:38.480000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %bmm : [num_users=1] = call_function[target=torch.ops.aten.bmm.default](args = (%view_15, %permute_2), kwargs = {})
V0614 00:48:38.480000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function tuned_bmm at 0x7fe1849af250>
V0614 00:48:38.482000 140616046391680 torch/_inductor/select_algorithm.py:1140] [15/0] Max autotune selects from 1 choices.
V0614 00:48:38.483000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_17 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%bmm, [1, 12, 64, 832]), kwargs = {})
V0614 00:48:38.483000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.484000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %mul_3 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%view_17, 0.125), kwargs = {})
V0614 00:48:38.484000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function mul at 0x7fe2165653f0>
V0614 00:48:38.485000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %sub : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (1.0, %arg16_1), kwargs = {})
V0614 00:48:38.485000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function make_pointwise.<locals>.inner at 0x7fe216574280>
V0614 00:48:38.486000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %mul_4 : [num_users=2] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub, -10000.0), kwargs = {})
V0614 00:48:38.486000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function mul at 0x7fe2165653f0>
V0614 00:48:38.487000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %add_2 : [num_users=2] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_3, %mul_4), kwargs = {})
V0614 00:48:38.487000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function make_pointwise.<locals>.inner at 0x7fe216566ef0>
V0614 00:48:38.488000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %amax : [num_users=1] = call_function[target=torch.ops.aten.amax.default](args = (%add_2, [-1], True), kwargs = {})
V0614 00:48:38.489000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function make_reduction.<locals>.inner at 0x7fe216566950>
V0614 00:48:38.489000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %sub_1 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%add_2, %amax), kwargs = {})
V0614 00:48:38.489000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function make_pointwise.<locals>.inner at 0x7fe216574280>
V0614 00:48:38.492000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %exp : [num_users=2] = call_function[target=torch.ops.aten.exp.default](args = (%sub_1,), kwargs = {})
V0614 00:48:38.492000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function make_pointwise.<locals>.inner at 0x7fe2165672e0>
V0614 00:48:38.494000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %sum_1 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%exp, [-1], True), kwargs = {})
V0614 00:48:38.494000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function sum_ at 0x7fe216565a20>
V0614 00:48:38.495000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %div_2 : [num_users=1] = call_function[target=torch.ops.aten.div.Tensor](args = (%exp, %sum_1), kwargs = {})
V0614 00:48:38.495000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function div at 0x7fe2165656c0>
V0614 00:48:38.497000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %convert_element_type_5 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%div_2, torch.bfloat16), kwargs = {})
V0614 00:48:38.497000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function _convert_element_type at 0x7fe2179c13f0>
V0614 00:48:38.498000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_18 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%convert_element_type_5, [-1, 64, 832]), kwargs = {})
V0614 00:48:38.498000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.498000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_19 : [num_users=2] = call_function[target=torch.ops.aten.reshape.default](args = (%arg15_1, [12, 832, 64]), kwargs = {})
V0614 00:48:38.499000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.499000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %bmm_1 : [num_users=1] = call_function[target=torch.ops.aten.bmm.default](args = (%view_18, %view_19), kwargs = {})
V0614 00:48:38.499000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function tuned_bmm at 0x7fe1849af250>
V0614 00:48:38.500000 140616046391680 torch/_inductor/select_algorithm.py:1140] [15/0] Max autotune selects from 1 choices.
V0614 00:48:38.501000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_20 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%bmm_1, [1, 12, 64, 64]), kwargs = {})
V0614 00:48:38.501000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.501000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %unsqueeze_4 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%view_20, 2), kwargs = {})
V0614 00:48:38.501000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:38.502000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_13 : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%view_4, 2, 1), kwargs = {})
V0614 00:48:38.502000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function select at 0x7fe2179c4550>
V0614 00:48:38.503000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_21 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%select_13, [12, 64, 64]), kwargs = {})
V0614 00:48:38.503000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.503000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_5 : [num_users=10] = call_function[target=torch.ops.aten.reshape.default](args = (%arg14_1, [1, 12, 13, 64, -1]), kwargs = {})
V0614 00:48:38.503000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.504000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_3 : [num_users=3] = call_function[target=torch.ops.aten.select.int](args = (%view_5, 2, 0), kwargs = {})
V0614 00:48:38.504000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function select at 0x7fe2179c4550>
V0614 00:48:38.505000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_4 : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%view_5, 2, 1), kwargs = {})
V0614 00:48:38.505000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function select at 0x7fe2179c4550>
V0614 00:48:38.506000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_5 : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%view_5, 2, 2), kwargs = {})
V0614 00:48:38.506000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function select at 0x7fe2179c4550>
V0614 00:48:38.507000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_6 : [num_users=3] = call_function[target=torch.ops.aten.select.int](args = (%view_5, 2, -1), kwargs = {})
V0614 00:48:38.507000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function select at 0x7fe2179c4550>
V0614 00:48:38.508000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %clone_2 : [num_users=1] = call_function[target=torch.ops.aten.clone.default](args = (%view_5,), kwargs = {memory_format: torch.contiguous_format})
V0614 00:48:38.508000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function clone at 0x7fe2179aac20>
V0614 00:48:38.510000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_8 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%clone_2, [156, 64, 64]), kwargs = {})
V0614 00:48:38.510000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.511000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %cat : [num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%arg0_1, %arg1_1, %arg2_1, %arg3_1, %arg4_1, %arg5_1, %arg6_1, %arg7_1, %arg8_1, %arg9_1, %arg10_1, %arg11_1],), kwargs = {})
V0614 00:48:38.511000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function cat at 0x7fe2179c40d0>
V0614 00:48:38.514000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%cat, [12, 11, 3]), kwargs = {})
V0614 00:48:38.514000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.515000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %convert_element_type : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%view, torch.int64), kwargs = {})
V0614 00:48:38.515000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function _convert_element_type at 0x7fe2179c13f0>
V0614 00:48:38.516000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %unsqueeze : [num_users=3] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%convert_element_type, 0), kwargs = {})
V0614 00:48:38.516000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:38.517000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_7 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%unsqueeze, [-1]), kwargs = {})
V0614 00:48:38.518000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.518000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %iota : [num_users=1] = call_function[target=torch.ops.prims.iota.default](args = (396,), kwargs = {start: 0, step: 1, dtype: torch.int64, device: cpu, requires_grad: False})
V0614 00:48:38.518000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function iota at 0x7fe2179aae60>
V0614 00:48:38.518000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %div : [num_users=1] = call_function[target=torch.ops.aten.div.Tensor_mode](args = (%iota, 33), kwargs = {rounding_mode: floor})
V0614 00:48:38.519000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function div_mode at 0x7fe2165652d0>
V0614 00:48:38.519000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %mul_1 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%div, 13), kwargs = {})
V0614 00:48:38.519000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function mul at 0x7fe2165653f0>
V0614 00:48:38.520000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %add : [num_users=2] = call_function[target=torch.ops.aten.add.Tensor](args = (%view_7, %mul_1), kwargs = {})
V0614 00:48:38.520000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function make_pointwise.<locals>.inner at 0x7fe216566ef0>
V0614 00:48:38.521000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %index_1 : [num_users=1] = call_function[target=torch.ops.aten.index.Tensor](args = (%view_8, [%add]), kwargs = {})
V0614 00:48:38.521000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function index at 0x7fe21799ca60>
V0614 00:48:38.523000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_9 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%index_1, [1, 12, 33, 64, 64]), kwargs = {})
V0614 00:48:38.524000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.525000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_10 : [num_users=3] = call_function[target=torch.ops.aten.reshape.default](args = (%view_9, [1, 12, 11, 192, -1]), kwargs = {})
V0614 00:48:38.525000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.534000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_7 : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%view_10, 2, 0), kwargs = {})
V0614 00:48:38.534000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function select at 0x7fe2179c4550>
V0614 00:48:38.540000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %cat_1 : [num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%select_3, %select_4, %select_5, %select_6, %select_7], 2), kwargs = {})
V0614 00:48:38.540000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function cat at 0x7fe2179c40d0>
V0614 00:48:38.541000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_22 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%cat_1, [-1, 448, 64]), kwargs = {})
V0614 00:48:38.542000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.542000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_3 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%view_22, [0, 2, 1]), kwargs = {})
V0614 00:48:38.542000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function permute at 0x7fe2179c3370>
V0614 00:48:38.543000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %bmm_2 : [num_users=1] = call_function[target=torch.ops.aten.bmm.default](args = (%view_21, %permute_3), kwargs = {})
V0614 00:48:38.543000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function tuned_bmm at 0x7fe1849af250>
V0614 00:48:38.544000 140616046391680 torch/_inductor/select_algorithm.py:1140] [15/0] Max autotune selects from 1 choices.
V0614 00:48:38.544000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_23 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%bmm_2, [1, 12, 64, 448]), kwargs = {})
V0614 00:48:38.544000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.545000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %mul_5 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%view_23, 0.125), kwargs = {})
V0614 00:48:38.545000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function mul at 0x7fe2165653f0>
V0614 00:48:38.546000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %slice_30 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%arg16_1, 3, 0, 192), kwargs = {})
V0614 00:48:38.546000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function slice_ at 0x7fe2179c3490>
V0614 00:48:38.547000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %slice_34 : [num_users=2] = call_function[target=torch.ops.aten.slice.Tensor](args = (%arg16_1, 3, -64, 9223372036854775807), kwargs = {})
V0614 00:48:38.547000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function slice_ at 0x7fe2179c3490>
V0614 00:48:38.548000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %full_default : [num_users=2] = call_function[target=torch.ops.aten.full.default](args = ([1, 1, 1, 192], 1), kwargs = {dtype: torch.float32, layout: torch.strided, device: cpu, pin_memory: False})
V0614 00:48:38.548000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function full at 0x7fe21799c550>
V0614 00:48:38.548000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %cat_3 : [num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%slice_30, %slice_34, %full_default], 3), kwargs = {})
V0614 00:48:38.548000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function cat at 0x7fe2179c40d0>
V0614 00:48:38.549000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %full_default_1 : [num_users=2] = call_function[target=torch.ops.aten.full.default](args = ([1, 12, 64, 256], 1), kwargs = {dtype: torch.float32, layout: torch.strided, device: cpu, pin_memory: False})
V0614 00:48:38.549000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function full at 0x7fe21799c550>
V0614 00:48:38.550000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %slice_2 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%arg13_1, 1, 1, -1), kwargs = {})
V0614 00:48:38.550000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function slice_ at 0x7fe2179c3490>
V0614 00:48:38.551000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %unsqueeze_1 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%slice_2, 3), kwargs = {})
V0614 00:48:38.551000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:38.551000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %unsqueeze_2 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%unsqueeze_1, 4), kwargs = {})
V0614 00:48:38.551000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:38.552000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%unsqueeze_2, [0, 3, 1, 2, 4]), kwargs = {})
V0614 00:48:38.552000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function permute at 0x7fe2179c3370>
V0614 00:48:38.552000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%arg13_1, 0, 0), kwargs = {})
V0614 00:48:38.552000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function select at 0x7fe2179c4550>
V0614 00:48:38.553000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_1 : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%unsqueeze, 0, 0), kwargs = {})
V0614 00:48:38.553000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function select at 0x7fe2179c4550>
V0614 00:48:38.554000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_1 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%select_1, [396]), kwargs = {})
V0614 00:48:38.554000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.554000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %index : [num_users=1] = call_function[target=torch.ops.aten.index.Tensor](args = (%select, [%view_1]), kwargs = {})
V0614 00:48:38.554000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function index at 0x7fe21799ca60>
V0614 00:48:38.555000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_2 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%index, [1, 396, 64]), kwargs = {})
V0614 00:48:38.555000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.556000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_3 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%view_2, [1, 12, 11, 192]), kwargs = {})
V0614 00:48:38.556000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.559000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %unsqueeze_3 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%view_3, 4), kwargs = {})
V0614 00:48:38.559000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:38.559000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_1 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%unsqueeze_3, [0, 1, 2, 4, 3]), kwargs = {})
V0614 00:48:38.560000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function permute at 0x7fe2179c3370>
V0614 00:48:38.560000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %mul : [num_users=3] = call_function[target=torch.ops.aten.mul.Tensor](args = (%permute, %permute_1), kwargs = {})
V0614 00:48:38.560000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function mul at 0x7fe2165653f0>
V0614 00:48:38.566000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_14 : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%mul, 2, 0), kwargs = {})
V0614 00:48:38.566000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function select at 0x7fe2179c4550>
V0614 00:48:38.568000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %cat_4 : [num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%full_default_1, %select_14], 3), kwargs = {})
V0614 00:48:38.568000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function cat at 0x7fe2179c40d0>
V0614 00:48:38.569000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %minimum : [num_users=1] = call_function[target=torch.ops.aten.minimum.default](args = (%cat_3, %cat_4), kwargs = {})
V0614 00:48:38.570000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function make_pointwise.<locals>.inner at 0x7fe216577130>
V0614 00:48:38.571000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %sub_2 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (1.0, %minimum), kwargs = {})
V0614 00:48:38.571000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function make_pointwise.<locals>.inner at 0x7fe216574280>
V0614 00:48:38.572000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %mul_6 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub_2, -10000.0), kwargs = {})
V0614 00:48:38.572000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function mul at 0x7fe2165653f0>
V0614 00:48:38.573000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %add_3 : [num_users=2] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_5, %mul_6), kwargs = {})
V0614 00:48:38.573000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function make_pointwise.<locals>.inner at 0x7fe216566ef0>
V0614 00:48:38.575000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %amax_1 : [num_users=1] = call_function[target=torch.ops.aten.amax.default](args = (%add_3, [-1], True), kwargs = {})
V0614 00:48:38.575000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function make_reduction.<locals>.inner at 0x7fe216566950>
V0614 00:48:38.576000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %sub_3 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%add_3, %amax_1), kwargs = {})
V0614 00:48:38.576000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function make_pointwise.<locals>.inner at 0x7fe216574280>
V0614 00:48:38.578000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %exp_1 : [num_users=2] = call_function[target=torch.ops.aten.exp.default](args = (%sub_3,), kwargs = {})
V0614 00:48:38.579000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function make_pointwise.<locals>.inner at 0x7fe2165672e0>
V0614 00:48:38.581000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %sum_2 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%exp_1, [-1], True), kwargs = {})
V0614 00:48:38.581000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function sum_ at 0x7fe216565a20>
V0614 00:48:38.582000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %div_3 : [num_users=1] = call_function[target=torch.ops.aten.div.Tensor](args = (%exp_1, %sum_2), kwargs = {})
V0614 00:48:38.582000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function div at 0x7fe2165656c0>
V0614 00:48:38.584000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %convert_element_type_12 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%div_3, torch.bfloat16), kwargs = {})
V0614 00:48:38.584000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function _convert_element_type at 0x7fe2179c13f0>
V0614 00:48:38.585000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_24 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%convert_element_type_12, [-1, 64, 448]), kwargs = {})
V0614 00:48:38.585000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.585000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_6 : [num_users=10] = call_function[target=torch.ops.aten.reshape.default](args = (%arg15_1, [1, 12, 13, 64, -1]), kwargs = {})
V0614 00:48:38.585000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.586000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_8 : [num_users=3] = call_function[target=torch.ops.aten.select.int](args = (%view_6, 2, 0), kwargs = {})
V0614 00:48:38.586000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function select at 0x7fe2179c4550>
V0614 00:48:38.587000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_9 : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%view_6, 2, 1), kwargs = {})
V0614 00:48:38.587000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function select at 0x7fe2179c4550>
V0614 00:48:38.588000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_10 : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%view_6, 2, 2), kwargs = {})
V0614 00:48:38.588000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function select at 0x7fe2179c4550>
V0614 00:48:38.589000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_11 : [num_users=3] = call_function[target=torch.ops.aten.select.int](args = (%view_6, 2, -1), kwargs = {})
V0614 00:48:38.589000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function select at 0x7fe2179c4550>
V0614 00:48:38.590000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %clone_3 : [num_users=1] = call_function[target=torch.ops.aten.clone.default](args = (%view_6,), kwargs = {memory_format: torch.contiguous_format})
V0614 00:48:38.590000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function clone at 0x7fe2179aac20>
V0614 00:48:38.591000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_12 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%clone_3, [156, 64, 64]), kwargs = {})
V0614 00:48:38.591000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.591000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %index_2 : [num_users=1] = call_function[target=torch.ops.aten.index.Tensor](args = (%view_12, [%add]), kwargs = {})
V0614 00:48:38.592000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function index at 0x7fe21799ca60>
V0614 00:48:38.593000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_13 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%index_2, [1, 12, 33, 64, 64]), kwargs = {})
V0614 00:48:38.593000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.594000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_14 : [num_users=3] = call_function[target=torch.ops.aten.reshape.default](args = (%view_13, [1, 12, 11, 192, -1]), kwargs = {})
V0614 00:48:38.594000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.595000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_12 : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%view_14, 2, 0), kwargs = {})
V0614 00:48:38.595000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function select at 0x7fe2179c4550>
V0614 00:48:38.597000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %cat_2 : [num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%select_8, %select_9, %select_10, %select_11, %select_12], 2), kwargs = {})
V0614 00:48:38.597000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function cat at 0x7fe2179c40d0>
V0614 00:48:38.598000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_25 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%cat_2, [-1, 448, 64]), kwargs = {})
V0614 00:48:38.598000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.599000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %bmm_3 : [num_users=1] = call_function[target=torch.ops.aten.bmm.default](args = (%view_24, %view_25), kwargs = {})
V0614 00:48:38.599000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function tuned_bmm at 0x7fe1849af250>
V0614 00:48:38.600000 140616046391680 torch/_inductor/select_algorithm.py:1140] [15/0] Max autotune selects from 1 choices.
V0614 00:48:38.600000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_26 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%bmm_3, [1, 12, 64, 64]), kwargs = {})
V0614 00:48:38.600000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.600000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %unsqueeze_5 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%view_26, 2), kwargs = {})
V0614 00:48:38.600000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:38.601000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %slice_57 : [num_users=2] = call_function[target=torch.ops.aten.slice.Tensor](args = (%view_4, 2, 2, -2), kwargs = {})
V0614 00:48:38.601000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function slice_ at 0x7fe2179c3490>
V0614 00:48:38.602000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %unsqueeze_6 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%slice_57, 5), kwargs = {})
V0614 00:48:38.602000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:38.603000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_6 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%unsqueeze_6, [0, 1, 2, 3, 5, 4]), kwargs = {})
V0614 00:48:38.603000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function permute at 0x7fe2179c3370>
V0614 00:48:38.603000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_8 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%permute_6, [1, 2, 3, 5, 0, 4]), kwargs = {})
V0614 00:48:38.603000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function permute at 0x7fe2179c3370>
V0614 00:48:38.603000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_33 : [num_users=2] = call_function[target=torch.ops.aten.reshape.default](args = (%permute_8, [12, 576, 64]), kwargs = {})
V0614 00:48:38.604000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.605000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %unsqueeze_7 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%select_3, 4), kwargs = {})
V0614 00:48:38.605000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:38.606000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %unsqueeze_8 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%unsqueeze_7, 5), kwargs = {})
V0614 00:48:38.606000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:38.606000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_7 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%unsqueeze_8, [0, 1, 4, 5, 2, 3]), kwargs = {})
V0614 00:48:38.606000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function permute at 0x7fe2179c3370>
V0614 00:48:38.607000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_9 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%permute_7, [1, 5, 0, 4, 2, 3]), kwargs = {})
V0614 00:48:38.607000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function permute at 0x7fe2179c3370>
V0614 00:48:38.607000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_34 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%permute_9, [12, 64, 64]), kwargs = {})
V0614 00:48:38.607000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.608000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %bmm_6 : [num_users=1] = call_function[target=torch.ops.aten.bmm.default](args = (%view_33, %view_34), kwargs = {})
V0614 00:48:38.608000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function tuned_bmm at 0x7fe1849af250>
V0614 00:48:38.620000 140616046391680 torch/_inductor/select_algorithm.py:1140] [15/0] Max autotune selects from 1 choices.
V0614 00:48:38.620000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_35 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%bmm_6, [12, 9, 64, 1, 1, 64]), kwargs = {})
V0614 00:48:38.620000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.621000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_10 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%view_35, [4, 0, 1, 2, 5, 3]), kwargs = {})
V0614 00:48:38.621000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function permute at 0x7fe2179c3370>
V0614 00:48:38.621000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_36 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%permute_10, [1, 12, 9, 64, 64]), kwargs = {})
V0614 00:48:38.621000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.622000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %mul_9 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%view_36, 0.125), kwargs = {})
V0614 00:48:38.622000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function mul at 0x7fe2165653f0>
V0614 00:48:38.623000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %slice_68 : [num_users=2] = call_function[target=torch.ops.aten.slice.Tensor](args = (%arg16_1, 3, 0, 64), kwargs = {})
V0614 00:48:38.623000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function slice_ at 0x7fe2179c3490>
V0614 00:48:38.624000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %unsqueeze_12 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%slice_68, 3), kwargs = {})
V0614 00:48:38.624000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:38.624000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %sub_5 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (1.0, %unsqueeze_12), kwargs = {})
V0614 00:48:38.624000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function make_pointwise.<locals>.inner at 0x7fe216574280>
V0614 00:48:38.625000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %mul_12 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub_5, -10000.0), kwargs = {})
V0614 00:48:38.625000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function mul at 0x7fe2165653f0>
V0614 00:48:38.626000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %add_5 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_9, %mul_12), kwargs = {})
V0614 00:48:38.626000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function make_pointwise.<locals>.inner at 0x7fe216566ef0>
V0614 00:48:38.627000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %convert_element_type_24 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%add_5, torch.bfloat16), kwargs = {})
V0614 00:48:38.627000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function _convert_element_type at 0x7fe2179c13f0>
V0614 00:48:38.628000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %clone_4 : [num_users=1] = call_function[target=torch.ops.aten.clone.default](args = (%slice_57,), kwargs = {memory_format: torch.contiguous_format})
V0614 00:48:38.628000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function clone at 0x7fe2179aac20>
V0614 00:48:38.630000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_27 : [num_users=2] = call_function[target=torch.ops.aten.reshape.default](args = (%clone_4, [108, 64, 64]), kwargs = {})
V0614 00:48:38.630000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.631000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %slice_39 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%view_5, 2, 1, -3), kwargs = {})
V0614 00:48:38.631000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function slice_ at 0x7fe2179c3490>
V0614 00:48:38.632000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %slice_42 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%view_5, 2, 2, -2), kwargs = {})
V0614 00:48:38.632000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function slice_ at 0x7fe2179c3490>
V0614 00:48:38.633000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %slice_45 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%view_5, 2, 3, -1), kwargs = {})
V0614 00:48:38.633000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function slice_ at 0x7fe2179c3490>
V0614 00:48:38.634000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %cat_5 : [num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%slice_39, %slice_42, %slice_45], 3), kwargs = {})
V0614 00:48:38.634000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function cat at 0x7fe2179c40d0>
V0614 00:48:38.635000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_28 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%cat_5, [-1, 192, 64]), kwargs = {})
V0614 00:48:38.635000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.636000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_4 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%view_28, [0, 2, 1]), kwargs = {})
V0614 00:48:38.636000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function permute at 0x7fe2179c3370>
V0614 00:48:38.636000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %bmm_4 : [num_users=1] = call_function[target=torch.ops.aten.bmm.default](args = (%view_27, %permute_4), kwargs = {})
V0614 00:48:38.636000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function tuned_bmm at 0x7fe1849af250>
V0614 00:48:38.646000 140616046391680 torch/_inductor/select_algorithm.py:1140] [15/0] Max autotune selects from 1 choices.
V0614 00:48:38.646000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_29 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%bmm_4, [1, 12, 9, 64, 192]), kwargs = {})
V0614 00:48:38.646000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.647000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %mul_7 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%view_29, 0.125), kwargs = {})
V0614 00:48:38.647000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function mul at 0x7fe2165653f0>
V0614 00:48:38.648000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %sub_4 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (1.0, %arg17_1), kwargs = {})
V0614 00:48:38.648000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function make_pointwise.<locals>.inner at 0x7fe216574280>
V0614 00:48:38.649000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %mul_11 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub_4, -10000.0), kwargs = {})
V0614 00:48:38.649000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function mul at 0x7fe2165653f0>
V0614 00:48:38.650000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %add_4 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_7, %mul_11), kwargs = {})
V0614 00:48:38.650000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function make_pointwise.<locals>.inner at 0x7fe216566ef0>
V0614 00:48:38.652000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %convert_element_type_23 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%add_4, torch.bfloat16), kwargs = {})
V0614 00:48:38.652000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function _convert_element_type at 0x7fe2179c13f0>
V0614 00:48:38.654000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %slice_60 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%view_10, 2, 1, -1), kwargs = {})
V0614 00:48:38.654000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function slice_ at 0x7fe2179c3490>
V0614 00:48:38.660000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %clone_6 : [num_users=1] = call_function[target=torch.ops.aten.clone.default](args = (%slice_60,), kwargs = {memory_format: torch.contiguous_format})
V0614 00:48:38.660000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function clone at 0x7fe2179aac20>
V0614 00:48:38.669000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_31 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%clone_6, [108, 192, 64]), kwargs = {})
V0614 00:48:38.670000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.677000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_5 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%view_31, [0, 2, 1]), kwargs = {})
V0614 00:48:38.677000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function permute at 0x7fe2179c3370>
V0614 00:48:38.684000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %bmm_5 : [num_users=1] = call_function[target=torch.ops.aten.bmm.default](args = (%view_27, %permute_5), kwargs = {})
V0614 00:48:38.684000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function tuned_bmm at 0x7fe1849af250>
V0614 00:48:38.693000 140616046391680 torch/_inductor/select_algorithm.py:1140] [15/0] Max autotune selects from 1 choices.
V0614 00:48:38.693000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_32 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%bmm_5, [1, 12, 9, 64, 192]), kwargs = {})
V0614 00:48:38.693000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.693000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %mul_8 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%view_32, 0.125), kwargs = {})
V0614 00:48:38.693000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function mul at 0x7fe2165653f0>
V0614 00:48:38.694000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %slice_75 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%mul, 2, 1, -1), kwargs = {})
V0614 00:48:38.694000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function slice_ at 0x7fe2179c3490>
V0614 00:48:38.697000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %sub_7 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (1.0, %slice_75), kwargs = {})
V0614 00:48:38.697000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function make_pointwise.<locals>.inner at 0x7fe216574280>
V0614 00:48:38.701000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %mul_14 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub_7, -10000.0), kwargs = {})
V0614 00:48:38.701000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function mul at 0x7fe2165653f0>
V0614 00:48:38.702000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %add_7 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_8, %mul_14), kwargs = {})
V0614 00:48:38.702000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function make_pointwise.<locals>.inner at 0x7fe216566ef0>
V0614 00:48:38.705000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %convert_element_type_26 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%add_7, torch.bfloat16), kwargs = {})
V0614 00:48:38.705000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function _convert_element_type at 0x7fe2179c13f0>
V0614 00:48:38.706000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %unsqueeze_10 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%select_6, 4), kwargs = {})
V0614 00:48:38.706000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:38.707000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %unsqueeze_11 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%unsqueeze_10, 5), kwargs = {})
V0614 00:48:38.707000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:38.708000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_12 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%unsqueeze_11, [0, 1, 4, 5, 2, 3]), kwargs = {})
V0614 00:48:38.708000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function permute at 0x7fe2179c3370>
V0614 00:48:38.708000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_14 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%permute_12, [1, 5, 0, 4, 2, 3]), kwargs = {})
V0614 00:48:38.708000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function permute at 0x7fe2179c3370>
V0614 00:48:38.709000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_38 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%permute_14, [12, 64, 64]), kwargs = {})
V0614 00:48:38.709000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.710000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %bmm_7 : [num_users=1] = call_function[target=torch.ops.aten.bmm.default](args = (%view_33, %view_38), kwargs = {})
V0614 00:48:38.710000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function tuned_bmm at 0x7fe1849af250>
V0614 00:48:38.711000 140616046391680 torch/_inductor/select_algorithm.py:1140] [15/0] Max autotune selects from 1 choices.
V0614 00:48:38.711000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_39 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%bmm_7, [12, 9, 64, 1, 1, 64]), kwargs = {})
V0614 00:48:38.711000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.711000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_15 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%view_39, [4, 0, 1, 2, 5, 3]), kwargs = {})
V0614 00:48:38.712000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function permute at 0x7fe2179c3370>
V0614 00:48:38.712000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_40 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%permute_15, [1, 12, 9, 64, 64]), kwargs = {})
V0614 00:48:38.712000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.712000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %mul_10 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%view_40, 0.125), kwargs = {})
V0614 00:48:38.712000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function mul at 0x7fe2165653f0>
V0614 00:48:38.713000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %unsqueeze_13 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%slice_34, 3), kwargs = {})
V0614 00:48:38.713000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:38.714000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %sub_6 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (1.0, %unsqueeze_13), kwargs = {})
V0614 00:48:38.714000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function make_pointwise.<locals>.inner at 0x7fe216574280>
V0614 00:48:38.715000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %mul_13 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub_6, -10000.0), kwargs = {})
V0614 00:48:38.715000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function mul at 0x7fe2165653f0>
V0614 00:48:38.715000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %add_6 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_10, %mul_13), kwargs = {})
V0614 00:48:38.716000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function make_pointwise.<locals>.inner at 0x7fe216566ef0>
V0614 00:48:38.717000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %convert_element_type_25 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%add_6, torch.bfloat16), kwargs = {})
V0614 00:48:38.717000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function _convert_element_type at 0x7fe2179c13f0>
V0614 00:48:38.718000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %cat_7 : [num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%convert_element_type_24, %convert_element_type_23, %convert_element_type_26, %convert_element_type_25], -1), kwargs = {})
V0614 00:48:38.718000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function cat at 0x7fe2179c40d0>
V0614 00:48:38.720000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %convert_element_type_27 : [num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%cat_7, torch.float32), kwargs = {})
V0614 00:48:38.721000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function _convert_element_type at 0x7fe2179c13f0>
V0614 00:48:38.722000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %amax_2 : [num_users=1] = call_function[target=torch.ops.aten.amax.default](args = (%convert_element_type_27, [-1], True), kwargs = {})
V0614 00:48:38.722000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function make_reduction.<locals>.inner at 0x7fe216566950>
V0614 00:48:38.724000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %sub_8 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%convert_element_type_27, %amax_2), kwargs = {})
V0614 00:48:38.724000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function make_pointwise.<locals>.inner at 0x7fe216574280>
V0614 00:48:38.727000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %exp_2 : [num_users=2] = call_function[target=torch.ops.aten.exp.default](args = (%sub_8,), kwargs = {})
V0614 00:48:38.727000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function make_pointwise.<locals>.inner at 0x7fe2165672e0>
V0614 00:48:38.728000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %sum_3 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%exp_2, [-1], True), kwargs = {})
V0614 00:48:38.728000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function sum_ at 0x7fe216565a20>
V0614 00:48:38.729000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %div_4 : [num_users=1] = call_function[target=torch.ops.aten.div.Tensor](args = (%exp_2, %sum_3), kwargs = {})
V0614 00:48:38.729000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function div at 0x7fe2165656c0>
V0614 00:48:38.731000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %convert_element_type_28 : [num_users=4] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%div_4, torch.bfloat16), kwargs = {})
V0614 00:48:38.731000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function _convert_element_type at 0x7fe2179c13f0>
V0614 00:48:38.732000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %slice_80 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%convert_element_type_28, 4, 64, 256), kwargs = {})
V0614 00:48:38.732000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function slice_ at 0x7fe2179c3490>
V0614 00:48:38.733000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_41 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%slice_80, [108, 64, 192]), kwargs = {})
V0614 00:48:38.734000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.735000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %slice_48 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%view_6, 2, 1, -3), kwargs = {})
V0614 00:48:38.735000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function slice_ at 0x7fe2179c3490>
V0614 00:48:38.736000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %slice_51 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%view_6, 2, 2, -2), kwargs = {})
V0614 00:48:38.736000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function slice_ at 0x7fe2179c3490>
V0614 00:48:38.736000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %slice_54 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%view_6, 2, 3, -1), kwargs = {})
V0614 00:48:38.737000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function slice_ at 0x7fe2179c3490>
V0614 00:48:38.737000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %cat_6 : [num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%slice_48, %slice_51, %slice_54], 3), kwargs = {})
V0614 00:48:38.737000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function cat at 0x7fe2179c40d0>
V0614 00:48:38.738000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_42 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%cat_6, [-1, 192, 64]), kwargs = {})
V0614 00:48:38.738000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.739000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %bmm_8 : [num_users=1] = call_function[target=torch.ops.aten.bmm.default](args = (%view_41, %view_42), kwargs = {})
V0614 00:48:38.739000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function tuned_bmm at 0x7fe1849af250>
V0614 00:48:38.748000 140616046391680 torch/_inductor/select_algorithm.py:1140] [15/0] Max autotune selects from 1 choices.
V0614 00:48:38.748000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_43 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%bmm_8, [1, 12, 9, 64, 64]), kwargs = {})
V0614 00:48:38.748000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.748000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %slice_85 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%convert_element_type_28, 4, 256, -64), kwargs = {})
V0614 00:48:38.748000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function slice_ at 0x7fe2179c3490>
V0614 00:48:38.749000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_44 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%slice_85, [108, 64, 192]), kwargs = {})
V0614 00:48:38.749000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.750000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %slice_88 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%view_14, 2, 1, -1), kwargs = {})
V0614 00:48:38.750000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function slice_ at 0x7fe2179c3490>
V0614 00:48:38.751000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %clone_7 : [num_users=1] = call_function[target=torch.ops.aten.clone.default](args = (%slice_88,), kwargs = {memory_format: torch.contiguous_format})
V0614 00:48:38.751000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function clone at 0x7fe2179aac20>
V0614 00:48:38.754000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_45 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%clone_7, [108, 192, 64]), kwargs = {})
V0614 00:48:38.754000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.755000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %bmm_9 : [num_users=1] = call_function[target=torch.ops.aten.bmm.default](args = (%view_44, %view_45), kwargs = {})
V0614 00:48:38.755000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function tuned_bmm at 0x7fe1849af250>
V0614 00:48:38.767000 140616046391680 torch/_inductor/select_algorithm.py:1140] [15/0] Max autotune selects from 1 choices.
V0614 00:48:38.767000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_46 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%bmm_9, [1, 12, 9, 64, 64]), kwargs = {})
V0614 00:48:38.767000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.768000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %add_8 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%view_43, %view_46), kwargs = {})
V0614 00:48:38.768000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function make_pointwise.<locals>.inner at 0x7fe216566ef0>
V0614 00:48:38.769000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_47 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%add_8, [108, 64, 64]), kwargs = {})
V0614 00:48:38.769000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.770000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_48 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%view_47, [1, 12, 9, 64, 64]), kwargs = {})
V0614 00:48:38.770000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.772000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %slice_93 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%convert_element_type_28, 4, 0, 64), kwargs = {})
V0614 00:48:38.772000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function slice_ at 0x7fe2179c3490>
V0614 00:48:38.773000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %unsqueeze_14 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%slice_93, 5), kwargs = {})
V0614 00:48:38.773000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:38.774000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_16 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%unsqueeze_14, [0, 1, 2, 3, 5, 4]), kwargs = {})
V0614 00:48:38.774000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function permute at 0x7fe2179c3370>
V0614 00:48:38.774000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_18 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%permute_16, [1, 2, 3, 5, 0, 4]), kwargs = {})
V0614 00:48:38.774000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function permute at 0x7fe2179c3370>
V0614 00:48:38.774000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_49 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%permute_18, [12, 576, 64]), kwargs = {})
V0614 00:48:38.774000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.775000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %unsqueeze_15 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%select_8, 4), kwargs = {})
V0614 00:48:38.775000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:38.776000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %unsqueeze_16 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%unsqueeze_15, 5), kwargs = {})
V0614 00:48:38.776000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:38.777000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_17 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%unsqueeze_16, [0, 1, 4, 5, 3, 2]), kwargs = {})
V0614 00:48:38.777000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function permute at 0x7fe2179c3370>
V0614 00:48:38.777000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_19 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%permute_17, [1, 5, 0, 4, 2, 3]), kwargs = {})
V0614 00:48:38.777000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function permute at 0x7fe2179c3370>
V0614 00:48:38.777000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_50 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%permute_19, [12, 64, 64]), kwargs = {})
V0614 00:48:38.778000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.778000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %bmm_10 : [num_users=1] = call_function[target=torch.ops.aten.bmm.default](args = (%view_49, %view_50), kwargs = {})
V0614 00:48:38.778000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function tuned_bmm at 0x7fe1849af250>
V0614 00:48:38.785000 140616046391680 torch/_inductor/select_algorithm.py:1140] [15/0] Max autotune selects from 1 choices.
V0614 00:48:38.785000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_51 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%bmm_10, [12, 9, 64, 1, 1, 64]), kwargs = {})
V0614 00:48:38.785000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.786000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_20 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%view_51, [4, 0, 1, 2, 5, 3]), kwargs = {})
V0614 00:48:38.786000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function permute at 0x7fe2179c3370>
V0614 00:48:38.786000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_52 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%permute_20, [1, 12, 9, 64, 64]), kwargs = {})
V0614 00:48:38.786000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.786000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %add_9 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%view_48, %view_52), kwargs = {})
V0614 00:48:38.787000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function make_pointwise.<locals>.inner at 0x7fe216566ef0>
V0614 00:48:38.790000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_53 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%add_9, [108, 64, 64]), kwargs = {})
V0614 00:48:38.790000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.792000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_54 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%view_53, [1, 12, 9, 64, 64]), kwargs = {})
V0614 00:48:38.792000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.795000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %slice_100 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%convert_element_type_28, 4, -64, 9223372036854775807), kwargs = {})
V0614 00:48:38.795000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function slice_ at 0x7fe2179c3490>
V0614 00:48:38.796000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %unsqueeze_17 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%slice_100, 5), kwargs = {})
V0614 00:48:38.796000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:38.797000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_21 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%unsqueeze_17, [0, 1, 2, 3, 5, 4]), kwargs = {})
V0614 00:48:38.797000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function permute at 0x7fe2179c3370>
V0614 00:48:38.797000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_23 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%permute_21, [1, 2, 3, 5, 0, 4]), kwargs = {})
V0614 00:48:38.797000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function permute at 0x7fe2179c3370>
V0614 00:48:38.798000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_55 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%permute_23, [12, 576, 64]), kwargs = {})
V0614 00:48:38.798000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.798000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %unsqueeze_18 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%select_11, 4), kwargs = {})
V0614 00:48:38.798000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:38.799000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %unsqueeze_19 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%unsqueeze_18, 5), kwargs = {})
V0614 00:48:38.799000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:38.800000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_22 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%unsqueeze_19, [0, 1, 4, 5, 3, 2]), kwargs = {})
V0614 00:48:38.800000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function permute at 0x7fe2179c3370>
V0614 00:48:38.800000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_24 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%permute_22, [1, 5, 0, 4, 2, 3]), kwargs = {})
V0614 00:48:38.800000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function permute at 0x7fe2179c3370>
V0614 00:48:38.801000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_56 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%permute_24, [12, 64, 64]), kwargs = {})
V0614 00:48:38.801000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.801000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %bmm_11 : [num_users=1] = call_function[target=torch.ops.aten.bmm.default](args = (%view_55, %view_56), kwargs = {})
V0614 00:48:38.801000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function tuned_bmm at 0x7fe1849af250>
V0614 00:48:38.808000 140616046391680 torch/_inductor/select_algorithm.py:1140] [15/0] Max autotune selects from 1 choices.
V0614 00:48:38.808000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_57 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%bmm_11, [12, 9, 64, 1, 1, 64]), kwargs = {})
V0614 00:48:38.808000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.809000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_25 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%view_57, [4, 0, 1, 2, 5, 3]), kwargs = {})
V0614 00:48:38.809000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function permute at 0x7fe2179c3370>
V0614 00:48:38.809000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_58 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%permute_25, [1, 12, 9, 64, 64]), kwargs = {})
V0614 00:48:38.809000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.809000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %add_10 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%view_54, %view_58), kwargs = {})
V0614 00:48:38.810000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function make_pointwise.<locals>.inner at 0x7fe216566ef0>
V0614 00:48:38.814000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_59 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%add_10, [108, 64, 64]), kwargs = {})
V0614 00:48:38.814000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.818000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_60 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%view_59, [1, 12, 9, 64, 64]), kwargs = {})
V0614 00:48:38.818000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.822000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_29 : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%view_4, 2, -2), kwargs = {})
V0614 00:48:38.822000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function select at 0x7fe2179c4550>
V0614 00:48:38.824000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_61 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%select_29, [12, 64, 64]), kwargs = {})
V0614 00:48:38.824000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.824000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_20 : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%view_5, 2, -3), kwargs = {})
V0614 00:48:38.824000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function select at 0x7fe2179c4550>
V0614 00:48:38.825000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_21 : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%view_5, 2, -2), kwargs = {})
V0614 00:48:38.826000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function select at 0x7fe2179c4550>
V0614 00:48:38.826000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_23 : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%view_10, 2, -1), kwargs = {})
V0614 00:48:38.827000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function select at 0x7fe2179c4550>
V0614 00:48:38.832000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %cat_8 : [num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%select_3, %select_20, %select_21, %select_6, %select_23], 2), kwargs = {})
V0614 00:48:38.832000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function cat at 0x7fe2179c40d0>
V0614 00:48:38.833000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_62 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%cat_8, [-1, 448, 64]), kwargs = {})
V0614 00:48:38.833000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.834000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_26 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%view_62, [0, 2, 1]), kwargs = {})
V0614 00:48:38.834000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function permute at 0x7fe2179c3370>
V0614 00:48:38.834000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %bmm_12 : [num_users=1] = call_function[target=torch.ops.aten.bmm.default](args = (%view_61, %permute_26), kwargs = {})
V0614 00:48:38.834000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function tuned_bmm at 0x7fe1849af250>
V0614 00:48:38.835000 140616046391680 torch/_inductor/select_algorithm.py:1140] [15/0] Max autotune selects from 1 choices.
V0614 00:48:38.835000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_63 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%bmm_12, [1, 12, 64, 448]), kwargs = {})
V0614 00:48:38.835000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.836000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %mul_15 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%view_63, 0.125), kwargs = {})
V0614 00:48:38.836000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function mul at 0x7fe2165653f0>
V0614 00:48:38.837000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %slice_132 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%arg16_1, 3, -192, 9223372036854775807), kwargs = {})
V0614 00:48:38.837000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function slice_ at 0x7fe2179c3490>
V0614 00:48:38.838000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %cat_10 : [num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%slice_68, %slice_132, %full_default], 3), kwargs = {})
V0614 00:48:38.838000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function cat at 0x7fe2179c40d0>
V0614 00:48:38.839000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_30 : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%mul, 2, -1), kwargs = {})
V0614 00:48:38.839000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function select at 0x7fe2179c4550>
V0614 00:48:38.842000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %cat_11 : [num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%full_default_1, %select_30], 3), kwargs = {})
V0614 00:48:38.842000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function cat at 0x7fe2179c40d0>
V0614 00:48:38.843000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %minimum_1 : [num_users=1] = call_function[target=torch.ops.aten.minimum.default](args = (%cat_10, %cat_11), kwargs = {})
V0614 00:48:38.843000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function make_pointwise.<locals>.inner at 0x7fe216577130>
V0614 00:48:38.844000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %sub_9 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (1.0, %minimum_1), kwargs = {})
V0614 00:48:38.844000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function make_pointwise.<locals>.inner at 0x7fe216574280>
V0614 00:48:38.845000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %mul_16 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub_9, -10000.0), kwargs = {})
V0614 00:48:38.845000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function mul at 0x7fe2165653f0>
V0614 00:48:38.846000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %add_11 : [num_users=2] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_15, %mul_16), kwargs = {})
V0614 00:48:38.846000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function make_pointwise.<locals>.inner at 0x7fe216566ef0>
V0614 00:48:38.848000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %amax_3 : [num_users=1] = call_function[target=torch.ops.aten.amax.default](args = (%add_11, [-1], True), kwargs = {})
V0614 00:48:38.848000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function make_reduction.<locals>.inner at 0x7fe216566950>
V0614 00:48:38.849000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %sub_10 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%add_11, %amax_3), kwargs = {})
V0614 00:48:38.849000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function make_pointwise.<locals>.inner at 0x7fe216574280>
V0614 00:48:38.852000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %exp_3 : [num_users=2] = call_function[target=torch.ops.aten.exp.default](args = (%sub_10,), kwargs = {})
V0614 00:48:38.852000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function make_pointwise.<locals>.inner at 0x7fe2165672e0>
V0614 00:48:38.854000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %sum_4 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%exp_3, [-1], True), kwargs = {})
V0614 00:48:38.854000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function sum_ at 0x7fe216565a20>
V0614 00:48:38.855000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %div_5 : [num_users=1] = call_function[target=torch.ops.aten.div.Tensor](args = (%exp_3, %sum_4), kwargs = {})
V0614 00:48:38.855000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function div at 0x7fe2165656c0>
V0614 00:48:38.857000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %convert_element_type_41 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%div_5, torch.bfloat16), kwargs = {})
V0614 00:48:38.857000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function _convert_element_type at 0x7fe2179c13f0>
V0614 00:48:38.858000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_64 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%convert_element_type_41, [-1, 64, 448]), kwargs = {})
V0614 00:48:38.858000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.859000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_25 : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%view_6, 2, -3), kwargs = {})
V0614 00:48:38.859000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function select at 0x7fe2179c4550>
V0614 00:48:38.860000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_26 : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%view_6, 2, -2), kwargs = {})
V0614 00:48:38.860000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function select at 0x7fe2179c4550>
V0614 00:48:38.861000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_28 : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%view_14, 2, -1), kwargs = {})
V0614 00:48:38.861000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function select at 0x7fe2179c4550>
V0614 00:48:38.862000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %cat_9 : [num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%select_8, %select_25, %select_26, %select_11, %select_28], 2), kwargs = {})
V0614 00:48:38.862000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function cat at 0x7fe2179c40d0>
V0614 00:48:38.863000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_65 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%cat_9, [-1, 448, 64]), kwargs = {})
V0614 00:48:38.863000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.864000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %bmm_13 : [num_users=1] = call_function[target=torch.ops.aten.bmm.default](args = (%view_64, %view_65), kwargs = {})
V0614 00:48:38.864000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function tuned_bmm at 0x7fe1849af250>
V0614 00:48:38.864000 140616046391680 torch/_inductor/select_algorithm.py:1140] [15/0] Max autotune selects from 1 choices.
V0614 00:48:38.864000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_66 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%bmm_13, [1, 12, 64, 64]), kwargs = {})
V0614 00:48:38.865000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.865000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %unsqueeze_20 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%view_66, 2), kwargs = {})
V0614 00:48:38.865000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:38.865000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_31 : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%view_4, 2, -1), kwargs = {})
V0614 00:48:38.865000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function select at 0x7fe2179c4550>
V0614 00:48:38.866000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_67 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%select_31, [12, 64, 64]), kwargs = {})
V0614 00:48:38.867000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.867000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %bmm_14 : [num_users=1] = call_function[target=torch.ops.aten.bmm.default](args = (%view_67, %permute_2), kwargs = {})
V0614 00:48:38.867000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function tuned_bmm at 0x7fe1849af250>
V0614 00:48:38.868000 140616046391680 torch/_inductor/select_algorithm.py:1140] [15/0] Max autotune selects from 1 choices.
V0614 00:48:38.868000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_69 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%bmm_14, [1, 12, 64, 832]), kwargs = {})
V0614 00:48:38.868000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.869000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %mul_17 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%view_69, 0.125), kwargs = {})
V0614 00:48:38.869000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function mul at 0x7fe2165653f0>
V0614 00:48:38.870000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %add_12 : [num_users=2] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_17, %mul_4), kwargs = {})
V0614 00:48:38.870000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function make_pointwise.<locals>.inner at 0x7fe216566ef0>
V0614 00:48:38.872000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %amax_4 : [num_users=1] = call_function[target=torch.ops.aten.amax.default](args = (%add_12, [-1], True), kwargs = {})
V0614 00:48:38.872000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function make_reduction.<locals>.inner at 0x7fe216566950>
V0614 00:48:38.872000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %sub_12 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%add_12, %amax_4), kwargs = {})
V0614 00:48:38.873000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function make_pointwise.<locals>.inner at 0x7fe216574280>
V0614 00:48:38.875000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %exp_4 : [num_users=2] = call_function[target=torch.ops.aten.exp.default](args = (%sub_12,), kwargs = {})
V0614 00:48:38.875000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function make_pointwise.<locals>.inner at 0x7fe2165672e0>
V0614 00:48:38.876000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %sum_5 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%exp_4, [-1], True), kwargs = {})
V0614 00:48:38.877000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function sum_ at 0x7fe216565a20>
V0614 00:48:38.878000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %div_6 : [num_users=1] = call_function[target=torch.ops.aten.div.Tensor](args = (%exp_4, %sum_5), kwargs = {})
V0614 00:48:38.878000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function div at 0x7fe2165656c0>
V0614 00:48:38.879000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %convert_element_type_48 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%div_6, torch.bfloat16), kwargs = {})
V0614 00:48:38.879000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function _convert_element_type at 0x7fe2179c13f0>
V0614 00:48:38.880000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_70 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%convert_element_type_48, [-1, 64, 832]), kwargs = {})
V0614 00:48:38.880000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.881000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %bmm_15 : [num_users=1] = call_function[target=torch.ops.aten.bmm.default](args = (%view_70, %view_19), kwargs = {})
V0614 00:48:38.881000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function tuned_bmm at 0x7fe1849af250>
V0614 00:48:38.882000 140616046391680 torch/_inductor/select_algorithm.py:1140] [15/0] Max autotune selects from 1 choices.
V0614 00:48:38.882000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_72 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%bmm_15, [1, 12, 64, 64]), kwargs = {})
V0614 00:48:38.882000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.883000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %unsqueeze_21 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%view_72, 2), kwargs = {})
V0614 00:48:38.883000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:38.883000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %cat_12 : [num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%unsqueeze_4, %unsqueeze_5, %view_60, %unsqueeze_20, %unsqueeze_21], 2), kwargs = {})
V0614 00:48:38.883000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function cat at 0x7fe2179c40d0>
V0614 00:48:38.885000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_73 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%cat_12, [1, 12, 832, -1]), kwargs = {})
V0614 00:48:38.885000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function view at 0x7fe2179c3130>
V0614 00:48:38.885000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %mul_19 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%view_73, %arg18_1), kwargs = {})
V0614 00:48:38.886000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function mul at 0x7fe2165653f0>
V0614 00:48:38.887000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_28 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%mul_19, [0, 2, 1, 3]), kwargs = {})
V0614 00:48:38.887000 140616046391680 torch/_inductor/graph.py:976] [15/0]   via <function permute at 0x7fe2179c3370>
V0614 00:48:38.889000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering return (permute_28, unsqueeze)
V0614 00:48:38.894000 140616046391680 torch/_inductor/graph.py:1097] [15/0] Force channels last inputs for 0 conv for the current graph with id 8
W0614 00:48:39.040000 140616046391680 torch/fx/experimental/symbolic_shapes.py:4449] [15/0] indirect0 is not in var_ranges, defaulting to unknown range.
W0614 00:48:39.043000 140616046391680 torch/fx/experimental/symbolic_shapes.py:4449] [15/0] indirect0 is not in var_ranges, defaulting to unknown range.
W0614 00:48:40.026000 140616046391680 torch/fx/experimental/symbolic_shapes.py:4449] [15/0] q0 is not in var_ranges, defaulting to unknown range.
W0614 00:48:40.026000 140616046391680 torch/fx/experimental/symbolic_shapes.py:4449] [15/0] q1 is not in var_ranges, defaulting to unknown range.
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ExternKernelOut(
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   python_kernel_name='extern_kernels.bmm',
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   name=buf0,
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   layout=FixedLayout('cpu', torch.bfloat16, size=[12, 64, 832], stride=[53248, 832, 1]),
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   inputs=[ReinterpretView(
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     StorageBox(
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       InputBuffer(name='arg12_1', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 832, 64], stride=[638976, 64, 768, 1]))
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     ),
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     FixedLayout('cpu', torch.bfloat16, size=[12, 64, 64], stride=[64, 768, 1]),
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     origins={bmm}
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ), ReinterpretView(
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     StorageBox(
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       InputBuffer(name='arg14_1', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 832, 64], stride=[638976, 64, 768, 1]))
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     ),
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     FixedLayout('cpu', torch.bfloat16, size=[12, 64, 832], stride=[64, 1, 768]),
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     origins={bmm}
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   )],
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   constant_args=(),
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   kwargs={},
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   output_view=None,
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   python_kernel_name=extern_kernels.bmm,
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   cpp_kernel_name=at::bmm_out,
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ordered_kwargs_for_cpp_kernel=(),
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   op_overload=None,
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   arg_properties=[{}, {}],
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   kwarg_properties=None,
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   unbacked_bindings={},
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=bmm,
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={bmm}
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf1', layout=FixedLayout('cpu', torch.float32, size=[1, 12, 64, 1], stride=[768, 64, 1, 768]), data=Reduction(
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.float32,
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index, rindex):
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, _ = index
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       r0 = rindex
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(buf0, r0 + 832 * i2 + 53248 * i1)
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp1 = ops.constant(0.125, torch.bfloat16)
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp2 = tmp0 * tmp1
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp3 = ops.to_dtype(tmp2, torch.float32, src_dtype=torch.bfloat16)
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp4 = ops.constant(1.0, torch.float32)
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp5 = ops.load(arg16_1, r0)
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp6 = tmp4 - tmp5
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp7 = ops.constant(-10000.0, torch.float32)
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp8 = tmp6 * tmp7
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp9 = tmp3 + tmp8
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp9
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 1],
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   reduction_ranges=[832],
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   reduction_type=max,
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=amax,
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={mul_4, amax, mul_3, add_2, sub}
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf2', layout=FixedLayout('cpu', torch.float32, size=[1, 12, 64, 832], stride=[638976, 53248, 832, 1]), data=Pointwise(
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.float32,
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(buf0, i3 + 832 * i2 + 53248 * i1)
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp1 = ops.constant(0.125, torch.bfloat16)
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp2 = tmp0 * tmp1
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp3 = ops.to_dtype(tmp2, torch.float32, src_dtype=torch.bfloat16)
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp4 = ops.constant(1.0, torch.float32)
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp5 = ops.load(arg16_1, i3)
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp6 = tmp4 - tmp5
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp7 = ops.constant(-10000.0, torch.float32)
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp8 = tmp6 * tmp7
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp9 = tmp3 + tmp8
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp10 = ops.load(buf1, i2 + 64 * i1)
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp11 = tmp9 - tmp10
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp12 = ops.exp(tmp11)
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp12
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 832],
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=exp,
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={add_2, sub_1, mul_4, mul_3, exp, sub}
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.090000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf3', layout=FixedLayout('cpu', torch.float32, size=[1, 12, 64, 1], stride=[768, 64, 1, 768]), data=Reduction(
V0614 00:48:40.090000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.090000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.float32,
V0614 00:48:40.090000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index, rindex):
V0614 00:48:40.090000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, _ = index
V0614 00:48:40.090000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       r0 = rindex
V0614 00:48:40.090000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(buf2, r0 + 832 * i2 + 53248 * i1)
V0614 00:48:40.090000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.090000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.090000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 1],
V0614 00:48:40.090000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   reduction_ranges=[832],
V0614 00:48:40.090000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   reduction_type=sum,
V0614 00:48:40.090000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=sum_1,
V0614 00:48:40.090000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={sum_1}
V0614 00:48:40.090000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf4', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 64, 832], stride=[638976, 53248, 832, 1]), data=Pointwise(
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(buf2, i3 + 832 * i2 + 53248 * i1)
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp1 = ops.load(buf3, i2 + 64 * i1)
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp2 = tmp0 / tmp1
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp3 = ops.to_dtype(tmp2, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp3
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 832],
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=convert_element_type_5,
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={convert_element_type_5, div_2}
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ExternKernelOut(
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   python_kernel_name='extern_kernels.bmm',
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   name=buf5,
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   layout=FixedLayout('cpu', torch.bfloat16, size=[12, 64, 64], stride=[4096, 64, 1]),
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   inputs=[ReinterpretView(
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     StorageBox(
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       ComputedBuffer(name='buf4', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 64, 832], stride=[638976, 53248, 832, 1]), data=Pointwise(
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         'cpu',
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         torch.bfloat16,
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         def inner_fn(index):
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             _, i1, i2, i3 = index
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp0 = ops.load(buf2, i3 + 832 * i2 + 53248 * i1)
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp1 = ops.load(buf3, i2 + 64 * i1)
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp2 = tmp0 / tmp1
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp3 = ops.to_dtype(tmp2, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             return tmp3
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ,
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ranges=[1, 12, 64, 832],
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origin_node=convert_element_type_5,
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origins={convert_element_type_5, div_2}
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       ))
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     ),
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     FixedLayout('cpu', torch.bfloat16, size=[12, 64, 832], stride=[53248, 832, 1]),
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     origins={bmm_1}
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ), ReinterpretView(
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     StorageBox(
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       InputBuffer(name='arg15_1', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 832, 64], stride=[638976, 64, 768, 1]))
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     ),
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     FixedLayout('cpu', torch.bfloat16, size=[12, 832, 64], stride=[64, 768, 1]),
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     origins={bmm_1}
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   )],
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   constant_args=(),
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   kwargs={},
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   output_view=None,
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   python_kernel_name=extern_kernels.bmm,
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   cpp_kernel_name=at::bmm_out,
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ordered_kwargs_for_cpp_kernel=(),
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   op_overload=None,
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   arg_properties=[{}, {}],
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   kwarg_properties=None,
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   unbacked_bindings={},
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=bmm_1,
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={bmm_1}
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf6', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.int32,
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       i0, i1 = index
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg0_1, i1 + 3 * i0)
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[11, 3],
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat}
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf7', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.int32,
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       i0, i1 = index
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg1_1, i1 + 3 * i0)
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[11, 3],
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat}
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf8', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.int32,
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       i0, i1 = index
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg2_1, i1 + 3 * i0)
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[11, 3],
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat}
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf9', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.int32,
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       i0, i1 = index
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg3_1, i1 + 3 * i0)
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[11, 3],
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat}
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf10', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.int32,
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       i0, i1 = index
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg4_1, i1 + 3 * i0)
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[11, 3],
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat}
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf11', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.int32,
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       i0, i1 = index
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg5_1, i1 + 3 * i0)
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[11, 3],
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat}
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf12', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.int32,
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       i0, i1 = index
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg6_1, i1 + 3 * i0)
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[11, 3],
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat}
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf13', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.int32,
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       i0, i1 = index
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg7_1, i1 + 3 * i0)
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[11, 3],
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat}
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf14', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.int32,
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       i0, i1 = index
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg8_1, i1 + 3 * i0)
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[11, 3],
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat}
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf15', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.int32,
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       i0, i1 = index
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg9_1, i1 + 3 * i0)
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[11, 3],
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat}
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf16', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.int32,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       i0, i1 = index
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg10_1, i1 + 3 * i0)
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[11, 3],
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat}
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf17', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.int32,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       i0, i1 = index
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg11_1, i1 + 3 * i0)
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[11, 3],
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat}
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ConcatKernel(name='buf18', layout=FixedLayout('cpu', torch.int32, size=[132, 3], stride=[3, 1]), inputs=[ComputedBuffer(name='buf6', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.int32,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       i0, i1 = index
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg0_1, i1 + 3 * i0)
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[11, 3],
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat}
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf7', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.int32,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       i0, i1 = index
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg1_1, i1 + 3 * i0)
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[11, 3],
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat}
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf8', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.int32,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       i0, i1 = index
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg2_1, i1 + 3 * i0)
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[11, 3],
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat}
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf9', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.int32,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       i0, i1 = index
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg3_1, i1 + 3 * i0)
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[11, 3],
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat}
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf10', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.int32,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       i0, i1 = index
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg4_1, i1 + 3 * i0)
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[11, 3],
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat}
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf11', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.int32,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       i0, i1 = index
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg5_1, i1 + 3 * i0)
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[11, 3],
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat}
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf12', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.int32,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       i0, i1 = index
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg6_1, i1 + 3 * i0)
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[11, 3],
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat}
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf13', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.int32,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       i0, i1 = index
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg7_1, i1 + 3 * i0)
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[11, 3],
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat}
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf14', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.int32,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       i0, i1 = index
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg8_1, i1 + 3 * i0)
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[11, 3],
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat}
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf15', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.int32,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       i0, i1 = index
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg9_1, i1 + 3 * i0)
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[11, 3],
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat}
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf16', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.int32,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       i0, i1 = index
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg10_1, i1 + 3 * i0)
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[11, 3],
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat}
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf17', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.int32,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       i0, i1 = index
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg11_1, i1 + 3 * i0)
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[11, 3],
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat}
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))])
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf19', layout=FixedLayout('cpu', torch.int64, size=[12, 11, 3], stride=[33, 3, 1]), data=Pointwise(
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.int64,
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       i0, i1, i2 = index
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(buf18, i2 + 3 * i1 + 33 * i0)
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp1 = ops.to_dtype(tmp0, torch.int64, src_dtype=torch.int32)
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp1
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[12, 11, 3],
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=convert_element_type,
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={convert_element_type}
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf20', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg14_1, i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 64],
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_1}
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf21', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg14_1, 49152 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 64],
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_1}
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf22', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg14_1, 98304 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 64],
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_1}
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.099000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf23', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.099000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.099000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.099000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.099000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.099000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg14_1, 589824 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.099000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.099000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.099000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 64],
V0614 00:48:40.099000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.099000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_1}
V0614 00:48:40.099000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.100000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf24', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 192, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.100000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.100000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.100000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.100000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.100000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(buf19, 33 * ModularIndexing(i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(i3 + 64 * i2, 4096, 33))
V0614 00:48:40.100000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp1 = ops.index_expr(33 * ModularIndexing(i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(i3 + 64 * i2, 4096, 33), dtype=torch.int64)
V0614 00:48:40.100000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp2 = ops.constant(33, torch.int64)
V0614 00:48:40.100000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp3 = tmp1 // tmp2
V0614 00:48:40.100000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp4 = ops.constant(13, torch.int64)
V0614 00:48:40.100000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp5 = tmp3 * tmp4
V0614 00:48:40.100000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp6 = tmp0 + tmp5
V0614 00:48:40.100000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp7 = ops.load(arg14_1, 64 * ModularIndexing(tmp6, 13, 12) + 768 * ModularIndexing(i3 + 64 * i2, 64, 64) + 49152 * ModularIndexing(tmp6, 1, 13) + ModularIndexing(i3, 1, 64))
V0614 00:48:40.100000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp7
V0614 00:48:40.100000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.100000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 192, 64],
V0614 00:48:40.100000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.100000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_1}
V0614 00:48:40.100000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ConcatKernel(name='buf25', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 448, 64], stride=[344064, 28672, 64, 1]), inputs=[ComputedBuffer(name='buf20', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg14_1, i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 64],
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_1}
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf21', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg14_1, 49152 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 64],
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_1}
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf22', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg14_1, 98304 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 64],
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_1}
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf23', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg14_1, 589824 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 64],
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_1}
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf24', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 192, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(buf19, 33 * ModularIndexing(i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(i3 + 64 * i2, 4096, 33))
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp1 = ops.index_expr(33 * ModularIndexing(i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(i3 + 64 * i2, 4096, 33), dtype=torch.int64)
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp2 = ops.constant(33, torch.int64)
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp3 = tmp1 // tmp2
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp4 = ops.constant(13, torch.int64)
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp5 = tmp3 * tmp4
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp6 = tmp0 + tmp5
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp7 = ops.load(arg14_1, 64 * ModularIndexing(tmp6, 13, 12) + 768 * ModularIndexing(i3 + 64 * i2, 64, 64) + 49152 * ModularIndexing(tmp6, 1, 13) + ModularIndexing(i3, 1, 64))
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp7
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 192, 64],
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_1}
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))])
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ExternKernelOut(
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   python_kernel_name='extern_kernels.bmm',
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   name=buf26,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   layout=FixedLayout('cpu', torch.bfloat16, size=[12, 64, 448], stride=[28672, 448, 1]),
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   inputs=[ReinterpretView(
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     StorageBox(
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       InputBuffer(name='arg12_1', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 832, 64], stride=[638976, 64, 768, 1]))
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     ),
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     FixedLayout('cpu', torch.bfloat16, size=[12, 64, 64], stride=[64, 768, 1], offset=49152),
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     origins={bmm_2}
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ), ReinterpretView(
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     StorageBox(
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       ConcatKernel(name='buf25', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 448, 64], stride=[344064, 28672, 64, 1]), inputs=[ComputedBuffer(name='buf20', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         'cpu',
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         torch.bfloat16,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         def inner_fn(index):
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             _, i1, i2, i3 = index
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp0 = ops.load(arg14_1, i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             return tmp0
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ranges=[1, 12, 64, 64],
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origin_node=None,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origins={cat_1}
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       )), ComputedBuffer(name='buf21', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         'cpu',
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         torch.bfloat16,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         def inner_fn(index):
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             _, i1, i2, i3 = index
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp0 = ops.load(arg14_1, 49152 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             return tmp0
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ranges=[1, 12, 64, 64],
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origin_node=None,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origins={cat_1}
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       )), ComputedBuffer(name='buf22', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         'cpu',
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         torch.bfloat16,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         def inner_fn(index):
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             _, i1, i2, i3 = index
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp0 = ops.load(arg14_1, 98304 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             return tmp0
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ranges=[1, 12, 64, 64],
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origin_node=None,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origins={cat_1}
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       )), ComputedBuffer(name='buf23', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         'cpu',
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         torch.bfloat16,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         def inner_fn(index):
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             _, i1, i2, i3 = index
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp0 = ops.load(arg14_1, 589824 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             return tmp0
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ranges=[1, 12, 64, 64],
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origin_node=None,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origins={cat_1}
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       )), ComputedBuffer(name='buf24', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 192, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         'cpu',
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         torch.bfloat16,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         def inner_fn(index):
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             _, i1, i2, i3 = index
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp0 = ops.load(buf19, 33 * ModularIndexing(i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(i3 + 64 * i2, 4096, 33))
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp1 = ops.index_expr(33 * ModularIndexing(i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(i3 + 64 * i2, 4096, 33), dtype=torch.int64)
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp2 = ops.constant(33, torch.int64)
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp3 = tmp1 // tmp2
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp4 = ops.constant(13, torch.int64)
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp5 = tmp3 * tmp4
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp6 = tmp0 + tmp5
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp7 = ops.load(arg14_1, 64 * ModularIndexing(tmp6, 13, 12) + 768 * ModularIndexing(i3 + 64 * i2, 64, 64) + 49152 * ModularIndexing(tmp6, 1, 13) + ModularIndexing(i3, 1, 64))
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             return tmp7
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ranges=[1, 12, 192, 64],
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origin_node=None,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origins={cat_1}
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       ))])
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     ),
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     FixedLayout('cpu', torch.bfloat16, size=[12, 64, 448], stride=[28672, 1, 64]),
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     origins={bmm_2}
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   )],
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   constant_args=(),
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   kwargs={},
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   output_view=None,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   python_kernel_name=extern_kernels.bmm,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   cpp_kernel_name=at::bmm_out,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ordered_kwargs_for_cpp_kernel=(),
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   op_overload=None,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   arg_properties=[{}, {}],
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   kwarg_properties=None,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   unbacked_bindings={},
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=bmm_2,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={bmm_2}
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf27', layout=NonOwningLayout('cpu', torch.float32, size=[1, 1, 1, 192], stride=[448, 448, 448, 1]), data=Pointwise(
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.float32,
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, _, _, i3 = index
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg16_1, i3)
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 1, 1, 192],
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_3}
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf28', layout=NonOwningLayout('cpu', torch.float32, size=[1, 1, 1, 64], stride=[448, 448, 448, 1]), data=Pointwise(
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.float32,
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, _, _, i3 = index
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg16_1, 768 + i3)
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 1, 1, 64],
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_3}
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf29', layout=NonOwningLayout('cpu', torch.float32, size=[1, 1, 1, 192], stride=[448, 448, 448, 1]), data=Pointwise(
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.float32,
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, _, _, i3 = index
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.constant(1, torch.float32)
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 1, 1, 192],
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=full_default,
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={full_default}
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ConcatKernel(name='buf30', layout=FixedLayout('cpu', torch.float32, size=[1, 1, 1, 448], stride=[448, 448, 448, 1]), inputs=[ComputedBuffer(name='buf27', layout=NonOwningLayout('cpu', torch.float32, size=[1, 1, 1, 192], stride=[448, 448, 448, 1]), data=Pointwise(
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.float32,
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, _, _, i3 = index
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg16_1, i3)
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 1, 1, 192],
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_3}
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf28', layout=NonOwningLayout('cpu', torch.float32, size=[1, 1, 1, 64], stride=[448, 448, 448, 1]), data=Pointwise(
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.float32,
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, _, _, i3 = index
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg16_1, 768 + i3)
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 1, 1, 64],
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_3}
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf29', layout=NonOwningLayout('cpu', torch.float32, size=[1, 1, 1, 192], stride=[448, 448, 448, 1]), data=Pointwise(
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.float32,
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, _, _, i3 = index
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.constant(1, torch.float32)
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 1, 1, 192],
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=full_default,
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={full_default}
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))])
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf31', layout=NonOwningLayout('cpu', torch.float32, size=[1, 12, 64, 256], stride=[344064, 28672, 448, 1]), data=Pointwise(
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.float32,
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.constant(1, torch.float32)
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 256],
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=full_default_1,
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={full_default_1}
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf32', layout=NonOwningLayout('cpu', torch.float32, size=[1, 12, 64, 192], stride=[344064, 28672, 448, 1]), data=Pointwise(
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.float32,
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg13_1, 64 + i2)
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp1 = ops.load(buf19, ModularIndexing(i3 + 2112 * i1, 64, 396))
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp2 = ops.load(arg13_1, 64 * tmp1 + ModularIndexing(i3, 1, 64))
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp3 = tmp0 * tmp2
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp3
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 192],
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_4}
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ConcatKernel(name='buf33', layout=FixedLayout('cpu', torch.float32, size=[1, 12, 64, 448], stride=[344064, 28672, 448, 1]), inputs=[ComputedBuffer(name='buf31', layout=NonOwningLayout('cpu', torch.float32, size=[1, 12, 64, 256], stride=[344064, 28672, 448, 1]), data=Pointwise(
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.float32,
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.constant(1, torch.float32)
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 256],
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=full_default_1,
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={full_default_1}
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf32', layout=NonOwningLayout('cpu', torch.float32, size=[1, 12, 64, 192], stride=[344064, 28672, 448, 1]), data=Pointwise(
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.float32,
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg13_1, 64 + i2)
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp1 = ops.load(buf19, ModularIndexing(i3 + 2112 * i1, 64, 396))
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp2 = ops.load(arg13_1, 64 * tmp1 + ModularIndexing(i3, 1, 64))
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp3 = tmp0 * tmp2
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp3
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 192],
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_4}
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))])
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf34', layout=FixedLayout('cpu', torch.float32, size=[1, 12, 64, 1], stride=[768, 64, 1, 768]), data=Reduction(
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.float32,
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index, rindex):
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, _ = index
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       r0 = rindex
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(buf26, r0 + 448 * i2 + 28672 * i1)
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp1 = ops.constant(0.125, torch.bfloat16)
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp2 = tmp0 * tmp1
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp3 = ops.to_dtype(tmp2, torch.float32, src_dtype=torch.bfloat16)
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp4 = ops.constant(1.0, torch.float32)
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp5 = ops.load(buf30, r0)
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp6 = ops.load(buf33, r0 + 448 * i2 + 28672 * i1)
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp7 = ops.minimum(tmp5, tmp6)
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp8 = tmp4 - tmp7
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp9 = ops.constant(-10000.0, torch.float32)
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp10 = tmp8 * tmp9
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp11 = tmp3 + tmp10
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp11
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 1],
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   reduction_ranges=[448],
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   reduction_type=max,
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=amax_1,
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={mul_6, sub_2, minimum, amax_1, mul_5, add_3}
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf35', layout=FixedLayout('cpu', torch.float32, size=[1, 12, 64, 448], stride=[344064, 28672, 448, 1]), data=Pointwise(
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.float32,
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(buf26, i3 + 448 * i2 + 28672 * i1)
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp1 = ops.constant(0.125, torch.bfloat16)
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp2 = tmp0 * tmp1
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp3 = ops.to_dtype(tmp2, torch.float32, src_dtype=torch.bfloat16)
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp4 = ops.constant(1.0, torch.float32)
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp5 = ops.load(buf30, i3)
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp6 = ops.load(buf33, i3 + 448 * i2 + 28672 * i1)
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp7 = ops.minimum(tmp5, tmp6)
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp8 = tmp4 - tmp7
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp9 = ops.constant(-10000.0, torch.float32)
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp10 = tmp8 * tmp9
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp11 = tmp3 + tmp10
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp12 = ops.load(buf34, i2 + 64 * i1)
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp13 = tmp11 - tmp12
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp14 = ops.exp(tmp13)
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp14
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 448],
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=exp_1,
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={mul_6, sub_2, minimum, exp_1, mul_5, add_3, sub_3}
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf36', layout=FixedLayout('cpu', torch.float32, size=[1, 12, 64, 1], stride=[768, 64, 1, 768]), data=Reduction(
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.float32,
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index, rindex):
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, _ = index
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       r0 = rindex
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(buf35, r0 + 448 * i2 + 28672 * i1)
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 1],
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   reduction_ranges=[448],
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   reduction_type=sum,
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=sum_2,
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={sum_2}
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf37', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg15_1, i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 64],
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_2}
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf38', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg15_1, 49152 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 64],
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_2}
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf39', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg15_1, 98304 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 64],
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_2}
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf40', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg15_1, 589824 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 64],
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_2}
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf41', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 192, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(buf19, 33 * ModularIndexing(i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(i3 + 64 * i2, 4096, 33))
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp1 = ops.index_expr(33 * ModularIndexing(i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(i3 + 64 * i2, 4096, 33), dtype=torch.int64)
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp2 = ops.constant(33, torch.int64)
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp3 = tmp1 // tmp2
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp4 = ops.constant(13, torch.int64)
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp5 = tmp3 * tmp4
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp6 = tmp0 + tmp5
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp7 = ops.load(arg15_1, 64 * ModularIndexing(tmp6, 13, 12) + 768 * ModularIndexing(i3 + 64 * i2, 64, 64) + 49152 * ModularIndexing(tmp6, 1, 13) + ModularIndexing(i3, 1, 64))
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp7
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 192, 64],
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_2}
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ConcatKernel(name='buf42', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 448, 64], stride=[344064, 28672, 64, 1]), inputs=[ComputedBuffer(name='buf37', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg15_1, i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 64],
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_2}
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf38', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg15_1, 49152 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 64],
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_2}
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf39', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg15_1, 98304 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 64],
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_2}
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf40', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg15_1, 589824 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 64],
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_2}
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf41', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 192, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(buf19, 33 * ModularIndexing(i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(i3 + 64 * i2, 4096, 33))
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp1 = ops.index_expr(33 * ModularIndexing(i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(i3 + 64 * i2, 4096, 33), dtype=torch.int64)
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp2 = ops.constant(33, torch.int64)
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp3 = tmp1 // tmp2
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp4 = ops.constant(13, torch.int64)
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp5 = tmp3 * tmp4
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp6 = tmp0 + tmp5
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp7 = ops.load(arg15_1, 64 * ModularIndexing(tmp6, 13, 12) + 768 * ModularIndexing(i3 + 64 * i2, 64, 64) + 49152 * ModularIndexing(tmp6, 1, 13) + ModularIndexing(i3, 1, 64))
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp7
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 192, 64],
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_2}
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))])
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf43', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 64, 448], stride=[344064, 28672, 448, 1]), data=Pointwise(
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(buf35, i3 + 448 * i2 + 28672 * i1)
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp1 = ops.load(buf36, i2 + 64 * i1)
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp2 = tmp0 / tmp1
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp3 = ops.to_dtype(tmp2, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp3
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 448],
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=convert_element_type_12,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={convert_element_type_12, div_3}
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ExternKernelOut(
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   python_kernel_name='extern_kernels.bmm',
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   name=buf44,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   layout=FixedLayout('cpu', torch.bfloat16, size=[12, 64, 64], stride=[4096, 64, 1]),
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   inputs=[ReinterpretView(
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     StorageBox(
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       ComputedBuffer(name='buf43', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 64, 448], stride=[344064, 28672, 448, 1]), data=Pointwise(
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         'cpu',
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         torch.bfloat16,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         def inner_fn(index):
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             _, i1, i2, i3 = index
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp0 = ops.load(buf35, i3 + 448 * i2 + 28672 * i1)
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp1 = ops.load(buf36, i2 + 64 * i1)
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp2 = tmp0 / tmp1
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp3 = ops.to_dtype(tmp2, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             return tmp3
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ranges=[1, 12, 64, 448],
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origin_node=convert_element_type_12,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origins={convert_element_type_12, div_3}
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       ))
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     ),
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     FixedLayout('cpu', torch.bfloat16, size=[12, 64, 448], stride=[28672, 448, 1]),
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     origins={bmm_3}
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ), ReinterpretView(
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     StorageBox(
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       ConcatKernel(name='buf42', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 448, 64], stride=[344064, 28672, 64, 1]), inputs=[ComputedBuffer(name='buf37', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         'cpu',
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         torch.bfloat16,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         def inner_fn(index):
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             _, i1, i2, i3 = index
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp0 = ops.load(arg15_1, i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             return tmp0
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ranges=[1, 12, 64, 64],
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origin_node=None,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origins={cat_2}
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       )), ComputedBuffer(name='buf38', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         'cpu',
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         torch.bfloat16,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         def inner_fn(index):
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             _, i1, i2, i3 = index
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp0 = ops.load(arg15_1, 49152 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             return tmp0
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ranges=[1, 12, 64, 64],
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origin_node=None,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origins={cat_2}
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       )), ComputedBuffer(name='buf39', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         'cpu',
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         torch.bfloat16,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         def inner_fn(index):
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             _, i1, i2, i3 = index
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp0 = ops.load(arg15_1, 98304 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             return tmp0
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ranges=[1, 12, 64, 64],
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origin_node=None,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origins={cat_2}
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       )), ComputedBuffer(name='buf40', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         'cpu',
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         torch.bfloat16,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         def inner_fn(index):
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             _, i1, i2, i3 = index
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp0 = ops.load(arg15_1, 589824 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             return tmp0
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ranges=[1, 12, 64, 64],
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origin_node=None,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origins={cat_2}
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       )), ComputedBuffer(name='buf41', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 192, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         'cpu',
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         torch.bfloat16,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         def inner_fn(index):
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             _, i1, i2, i3 = index
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp0 = ops.load(buf19, 33 * ModularIndexing(i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(i3 + 64 * i2, 4096, 33))
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp1 = ops.index_expr(33 * ModularIndexing(i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(i3 + 64 * i2, 4096, 33), dtype=torch.int64)
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp2 = ops.constant(33, torch.int64)
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp3 = tmp1 // tmp2
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp4 = ops.constant(13, torch.int64)
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp5 = tmp3 * tmp4
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp6 = tmp0 + tmp5
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp7 = ops.load(arg15_1, 64 * ModularIndexing(tmp6, 13, 12) + 768 * ModularIndexing(i3 + 64 * i2, 64, 64) + 49152 * ModularIndexing(tmp6, 1, 13) + ModularIndexing(i3, 1, 64))
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             return tmp7
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ranges=[1, 12, 192, 64],
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origin_node=None,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origins={cat_2}
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       ))])
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     ),
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     FixedLayout('cpu', torch.bfloat16, size=[12, 448, 64], stride=[28672, 64, 1]),
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     origins={bmm_3}
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   )],
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   constant_args=(),
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   kwargs={},
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   output_view=None,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   python_kernel_name=extern_kernels.bmm,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   cpp_kernel_name=at::bmm_out,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ordered_kwargs_for_cpp_kernel=(),
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   op_overload=None,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   arg_properties=[{}, {}],
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   kwarg_properties=None,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   unbacked_bindings={},
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=bmm_3,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={bmm_3}
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ExternKernelOut(
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   python_kernel_name='extern_kernels.bmm',
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   name=buf45,
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   layout=FixedLayout('cpu', torch.bfloat16, size=[12, 576, 64], stride=[36864, 64, 1]),
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   inputs=[ReinterpretView(
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     StorageBox(
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       InputBuffer(name='arg12_1', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 832, 64], stride=[638976, 64, 768, 1]))
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     ),
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     FixedLayout('cpu', torch.bfloat16, size=[12, 576, 64], stride=[64, 768, 1], offset=98304),
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     origins={bmm_6}
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ), ReinterpretView(
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     StorageBox(
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       InputBuffer(name='arg14_1', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 832, 64], stride=[638976, 64, 768, 1]))
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     ),
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     FixedLayout('cpu', torch.bfloat16, size=[12, 64, 64], stride=[64, 1, 768]),
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     origins={bmm_6}
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   )],
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   constant_args=(),
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   kwargs={},
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   output_view=None,
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   python_kernel_name=extern_kernels.bmm,
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   cpp_kernel_name=at::bmm_out,
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ordered_kwargs_for_cpp_kernel=(),
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   op_overload=None,
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   arg_properties=[{}, {}],
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   kwarg_properties=None,
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   unbacked_bindings={},
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=bmm_6,
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={bmm_6}
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf46', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3, i4 = index
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg14_1, 49152 + i4 + 64 * i1 + 768 * i3 + 49152 * i2)
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_5}
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf47', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3, i4 = index
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg14_1, 98304 + i4 + 64 * i1 + 768 * i3 + 49152 * i2)
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_5}
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.124000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf48', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.124000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.124000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.124000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.124000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3, i4 = index
V0614 00:48:40.124000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg14_1, 147456 + i4 + 64 * i1 + 768 * i3 + 49152 * i2)
V0614 00:48:40.124000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.124000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.124000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.124000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.124000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_5}
V0614 00:48:40.124000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ConcatKernel(name='buf49', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 9, 192, 64], stride=[1327104, 110592, 12288, 64, 1]), inputs=[ComputedBuffer(name='buf46', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3, i4 = index
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg14_1, 49152 + i4 + 64 * i1 + 768 * i3 + 49152 * i2)
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_5}
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf47', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3, i4 = index
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg14_1, 98304 + i4 + 64 * i1 + 768 * i3 + 49152 * i2)
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_5}
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf48', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3, i4 = index
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg14_1, 147456 + i4 + 64 * i1 + 768 * i3 + 49152 * i2)
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_5}
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))])
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf50', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[442368, 36864, 4096, 64, 1]), data=Pointwise(
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3, i4 = index
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg12_1, 98304 + i4 + 64 * i1 + 768 * i3 + 49152 * i2)
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=clone_4,
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={clone_4}
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ExternKernelOut(
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   python_kernel_name='extern_kernels.bmm',
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   name=buf51,
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   layout=FixedLayout('cpu', torch.bfloat16, size=[108, 64, 192], stride=[12288, 192, 1]),
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   inputs=[ReinterpretView(
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     StorageBox(
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       ComputedBuffer(name='buf50', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[442368, 36864, 4096, 64, 1]), data=Pointwise(
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         'cpu',
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         torch.bfloat16,
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         def inner_fn(index):
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             _, i1, i2, i3, i4 = index
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp0 = ops.load(arg12_1, 98304 + i4 + 64 * i1 + 768 * i3 + 49152 * i2)
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             return tmp0
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ,
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origin_node=clone_4,
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origins={clone_4}
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       ))
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     ),
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     FixedLayout('cpu', torch.bfloat16, size=[108, 64, 64], stride=[4096, 64, 1]),
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     origins={bmm_4}
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ), ReinterpretView(
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     StorageBox(
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       ConcatKernel(name='buf49', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 9, 192, 64], stride=[1327104, 110592, 12288, 64, 1]), inputs=[ComputedBuffer(name='buf46', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         'cpu',
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         torch.bfloat16,
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         def inner_fn(index):
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             _, i1, i2, i3, i4 = index
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp0 = ops.load(arg14_1, 49152 + i4 + 64 * i1 + 768 * i3 + 49152 * i2)
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             return tmp0
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ,
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origin_node=None,
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origins={cat_5}
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       )), ComputedBuffer(name='buf47', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         'cpu',
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         torch.bfloat16,
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         def inner_fn(index):
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             _, i1, i2, i3, i4 = index
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp0 = ops.load(arg14_1, 98304 + i4 + 64 * i1 + 768 * i3 + 49152 * i2)
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             return tmp0
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ,
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origin_node=None,
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origins={cat_5}
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       )), ComputedBuffer(name='buf48', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         'cpu',
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         torch.bfloat16,
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         def inner_fn(index):
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             _, i1, i2, i3, i4 = index
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp0 = ops.load(arg14_1, 147456 + i4 + 64 * i1 + 768 * i3 + 49152 * i2)
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             return tmp0
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ,
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origin_node=None,
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origins={cat_5}
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       ))])
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     ),
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     FixedLayout('cpu', torch.bfloat16, size=[108, 64, 192], stride=[12288, 1, 64]),
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     origins={bmm_4}
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   )],
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   constant_args=(),
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   kwargs={},
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   output_view=None,
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   python_kernel_name=extern_kernels.bmm,
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   cpp_kernel_name=at::bmm_out,
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ordered_kwargs_for_cpp_kernel=(),
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   op_overload=None,
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   arg_properties=[{}, {}],
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   kwarg_properties=None,
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   unbacked_bindings={},
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=bmm_4,
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={bmm_4}
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )
V0614 00:48:40.127000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf52', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 9, 192, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.127000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.127000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.127000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.127000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3, i4 = index
V0614 00:48:40.127000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(buf19, 33 * ModularIndexing(12288 + i4 + 64 * i3 + 12288 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(12288 + i4 + 64 * i3 + 12288 * i2, 4096, 33))
V0614 00:48:40.127000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp1 = ops.index_expr(33 * ModularIndexing(12288 + i4 + 64 * i3 + 12288 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(12288 + i4 + 64 * i3 + 12288 * i2, 4096, 33), dtype=torch.int64)
V0614 00:48:40.127000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp2 = ops.constant(33, torch.int64)
V0614 00:48:40.127000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp3 = tmp1 // tmp2
V0614 00:48:40.127000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp4 = ops.constant(13, torch.int64)
V0614 00:48:40.127000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp5 = tmp3 * tmp4
V0614 00:48:40.127000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp6 = tmp0 + tmp5
V0614 00:48:40.127000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp7 = ops.load(arg14_1, 64 * ModularIndexing(tmp6, 13, 12) + 768 * ModularIndexing(i4 + 64 * i3, 64, 64) + 49152 * ModularIndexing(tmp6, 1, 13) + ModularIndexing(i4, 1, 64))
V0614 00:48:40.127000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp7
V0614 00:48:40.127000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.127000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 9, 192, 64],
V0614 00:48:40.127000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=clone_6,
V0614 00:48:40.127000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={clone_6}
V0614 00:48:40.127000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ExternKernelOut(
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   python_kernel_name='extern_kernels.bmm',
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   name=buf53,
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   layout=FixedLayout('cpu', torch.bfloat16, size=[108, 64, 192], stride=[12288, 192, 1]),
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   inputs=[ReinterpretView(
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     StorageBox(
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       ComputedBuffer(name='buf50', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[442368, 36864, 4096, 64, 1]), data=Pointwise(
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         'cpu',
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         torch.bfloat16,
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         def inner_fn(index):
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             _, i1, i2, i3, i4 = index
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp0 = ops.load(arg12_1, 98304 + i4 + 64 * i1 + 768 * i3 + 49152 * i2)
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             return tmp0
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ,
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origin_node=clone_4,
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origins={clone_4}
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       ))
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     ),
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     FixedLayout('cpu', torch.bfloat16, size=[108, 64, 64], stride=[4096, 64, 1]),
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     origins={bmm_5}
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ), ReinterpretView(
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     StorageBox(
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       ComputedBuffer(name='buf52', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 9, 192, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         'cpu',
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         torch.bfloat16,
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         def inner_fn(index):
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             _, i1, i2, i3, i4 = index
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp0 = ops.load(buf19, 33 * ModularIndexing(12288 + i4 + 64 * i3 + 12288 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(12288 + i4 + 64 * i3 + 12288 * i2, 4096, 33))
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp1 = ops.index_expr(33 * ModularIndexing(12288 + i4 + 64 * i3 + 12288 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(12288 + i4 + 64 * i3 + 12288 * i2, 4096, 33), dtype=torch.int64)
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp2 = ops.constant(33, torch.int64)
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp3 = tmp1 // tmp2
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp4 = ops.constant(13, torch.int64)
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp5 = tmp3 * tmp4
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp6 = tmp0 + tmp5
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp7 = ops.load(arg14_1, 64 * ModularIndexing(tmp6, 13, 12) + 768 * ModularIndexing(i4 + 64 * i3, 64, 64) + 49152 * ModularIndexing(tmp6, 1, 13) + ModularIndexing(i4, 1, 64))
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             return tmp7
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ,
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ranges=[1, 12, 9, 192, 64],
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origin_node=clone_6,
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origins={clone_6}
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       ))
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     ),
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     FixedLayout('cpu', torch.bfloat16, size=[108, 64, 192], stride=[12288, 1, 64]),
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     origins={bmm_5}
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   )],
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   constant_args=(),
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   kwargs={},
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   output_view=None,
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   python_kernel_name=extern_kernels.bmm,
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   cpp_kernel_name=at::bmm_out,
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ordered_kwargs_for_cpp_kernel=(),
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   op_overload=None,
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   arg_properties=[{}, {}],
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   kwarg_properties=None,
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   unbacked_bindings={},
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=bmm_5,
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={bmm_5}
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ExternKernelOut(
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   python_kernel_name='extern_kernels.bmm',
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   name=buf54,
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   layout=FixedLayout('cpu', torch.bfloat16, size=[12, 576, 64], stride=[36864, 64, 1]),
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   inputs=[ReinterpretView(
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     StorageBox(
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       InputBuffer(name='arg12_1', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 832, 64], stride=[638976, 64, 768, 1]))
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     ),
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     FixedLayout('cpu', torch.bfloat16, size=[12, 576, 64], stride=[64, 768, 1], offset=98304),
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     origins={bmm_7}
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ), ReinterpretView(
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     StorageBox(
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       InputBuffer(name='arg14_1', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 832, 64], stride=[638976, 64, 768, 1]))
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     ),
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     FixedLayout('cpu', torch.bfloat16, size=[12, 64, 64], stride=[64, 1, 768], offset=589824),
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     origins={bmm_7}
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   )],
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   constant_args=(),
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   kwargs={},
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   output_view=None,
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   python_kernel_name=extern_kernels.bmm,
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   cpp_kernel_name=at::bmm_out,
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ordered_kwargs_for_cpp_kernel=(),
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   op_overload=None,
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   arg_properties=[{}, {}],
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   kwarg_properties=None,
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   unbacked_bindings={},
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=bmm_7,
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={bmm_7}
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf55', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[3538944, 294912, 32768, 512, 1]), data=Pointwise(
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3, i4 = index
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(buf45, i4 + 64 * i3 + 4096 * i2 + 36864 * i1)
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp1 = ops.constant(0.125, torch.bfloat16)
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp2 = tmp0 * tmp1
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp3 = ops.to_dtype(tmp2, torch.float32, src_dtype=torch.bfloat16)
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp4 = ops.constant(1.0, torch.float32)
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp5 = ops.load(arg16_1, i4)
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp6 = tmp4 - tmp5
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp7 = ops.constant(-10000.0, torch.float32)
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp8 = tmp6 * tmp7
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp9 = tmp3 + tmp8
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp10 = ops.to_dtype(tmp9, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp10
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=convert_element_type_24,
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={sub_5, mul_12, mul_9, convert_element_type_24, add_5}
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf56', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 192], stride=[3538944, 294912, 32768, 512, 1]), data=Pointwise(
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3, i4 = index
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(buf51, i4 + 192 * i3 + 12288 * i2 + 110592 * i1)
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp1 = ops.constant(0.125, torch.bfloat16)
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp2 = tmp0 * tmp1
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp3 = ops.to_dtype(tmp2, torch.float32, src_dtype=torch.bfloat16)
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp4 = ops.constant(1.0, torch.float32)
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp5 = ops.load(arg17_1, i4 + 192 * i3 + 12288 * i2)
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp6 = tmp4 - tmp5
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp7 = ops.constant(-10000.0, torch.float32)
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp8 = tmp6 * tmp7
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp9 = tmp3 + tmp8
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp10 = ops.to_dtype(tmp9, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp10
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 9, 64, 192],
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=convert_element_type_23,
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={mul_11, convert_element_type_23, sub_4, mul_7, add_4}
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf57', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 192], stride=[3538944, 294912, 32768, 512, 1]), data=Pointwise(
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3, i4 = index
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(buf53, i4 + 192 * i3 + 12288 * i2 + 110592 * i1)
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp1 = ops.constant(0.125, torch.bfloat16)
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp2 = tmp0 * tmp1
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp3 = ops.to_dtype(tmp2, torch.float32, src_dtype=torch.bfloat16)
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp4 = ops.constant(1.0, torch.float32)
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp5 = ops.load(arg13_1, 128 + i3 + 64 * i2)
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp6 = ops.load(buf19, ModularIndexing(192 + i4 + 192 * i2 + 2112 * i1, 64, 396))
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp7 = ops.load(arg13_1, 64 * tmp6 + ModularIndexing(i4, 1, 64))
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp8 = tmp5 * tmp7
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp9 = tmp4 - tmp8
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp10 = ops.constant(-10000.0, torch.float32)
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp11 = tmp9 * tmp10
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp12 = tmp3 + tmp11
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp13 = ops.to_dtype(tmp12, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp13
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 9, 64, 192],
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=convert_element_type_26,
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={sub_7, convert_element_type_26, mul_8, add_7, mul_14}
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf58', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[3538944, 294912, 32768, 512, 1]), data=Pointwise(
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3, i4 = index
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(buf54, i4 + 64 * i3 + 4096 * i2 + 36864 * i1)
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp1 = ops.constant(0.125, torch.bfloat16)
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp2 = tmp0 * tmp1
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp3 = ops.to_dtype(tmp2, torch.float32, src_dtype=torch.bfloat16)
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp4 = ops.constant(1.0, torch.float32)
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp5 = ops.load(arg16_1, 768 + i4)
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp6 = tmp4 - tmp5
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp7 = ops.constant(-10000.0, torch.float32)
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp8 = tmp6 * tmp7
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp9 = tmp3 + tmp8
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp10 = ops.to_dtype(tmp9, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp10
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=convert_element_type_25,
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={mul_10, add_6, sub_6, mul_13, convert_element_type_25}
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ConcatKernel(name='buf59', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 512], stride=[3538944, 294912, 32768, 512, 1]), inputs=[ComputedBuffer(name='buf55', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[3538944, 294912, 32768, 512, 1]), data=Pointwise(
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3, i4 = index
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(buf45, i4 + 64 * i3 + 4096 * i2 + 36864 * i1)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp1 = ops.constant(0.125, torch.bfloat16)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp2 = tmp0 * tmp1
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp3 = ops.to_dtype(tmp2, torch.float32, src_dtype=torch.bfloat16)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp4 = ops.constant(1.0, torch.float32)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp5 = ops.load(arg16_1, i4)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp6 = tmp4 - tmp5
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp7 = ops.constant(-10000.0, torch.float32)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp8 = tmp6 * tmp7
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp9 = tmp3 + tmp8
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp10 = ops.to_dtype(tmp9, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp10
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=convert_element_type_24,
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={sub_5, mul_12, mul_9, convert_element_type_24, add_5}
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf56', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 192], stride=[3538944, 294912, 32768, 512, 1]), data=Pointwise(
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3, i4 = index
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(buf51, i4 + 192 * i3 + 12288 * i2 + 110592 * i1)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp1 = ops.constant(0.125, torch.bfloat16)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp2 = tmp0 * tmp1
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp3 = ops.to_dtype(tmp2, torch.float32, src_dtype=torch.bfloat16)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp4 = ops.constant(1.0, torch.float32)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp5 = ops.load(arg17_1, i4 + 192 * i3 + 12288 * i2)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp6 = tmp4 - tmp5
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp7 = ops.constant(-10000.0, torch.float32)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp8 = tmp6 * tmp7
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp9 = tmp3 + tmp8
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp10 = ops.to_dtype(tmp9, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp10
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 9, 64, 192],
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=convert_element_type_23,
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={mul_11, convert_element_type_23, sub_4, mul_7, add_4}
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf57', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 192], stride=[3538944, 294912, 32768, 512, 1]), data=Pointwise(
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3, i4 = index
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(buf53, i4 + 192 * i3 + 12288 * i2 + 110592 * i1)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp1 = ops.constant(0.125, torch.bfloat16)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp2 = tmp0 * tmp1
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp3 = ops.to_dtype(tmp2, torch.float32, src_dtype=torch.bfloat16)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp4 = ops.constant(1.0, torch.float32)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp5 = ops.load(arg13_1, 128 + i3 + 64 * i2)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp6 = ops.load(buf19, ModularIndexing(192 + i4 + 192 * i2 + 2112 * i1, 64, 396))
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp7 = ops.load(arg13_1, 64 * tmp6 + ModularIndexing(i4, 1, 64))
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp8 = tmp5 * tmp7
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp9 = tmp4 - tmp8
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp10 = ops.constant(-10000.0, torch.float32)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp11 = tmp9 * tmp10
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp12 = tmp3 + tmp11
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp13 = ops.to_dtype(tmp12, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp13
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 9, 64, 192],
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=convert_element_type_26,
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={sub_7, convert_element_type_26, mul_8, add_7, mul_14}
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf58', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[3538944, 294912, 32768, 512, 1]), data=Pointwise(
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3, i4 = index
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(buf54, i4 + 64 * i3 + 4096 * i2 + 36864 * i1)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp1 = ops.constant(0.125, torch.bfloat16)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp2 = tmp0 * tmp1
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp3 = ops.to_dtype(tmp2, torch.float32, src_dtype=torch.bfloat16)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp4 = ops.constant(1.0, torch.float32)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp5 = ops.load(arg16_1, 768 + i4)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp6 = tmp4 - tmp5
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp7 = ops.constant(-10000.0, torch.float32)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp8 = tmp6 * tmp7
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp9 = tmp3 + tmp8
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp10 = ops.to_dtype(tmp9, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp10
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=convert_element_type_25,
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={mul_10, add_6, sub_6, mul_13, convert_element_type_25}
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))])
V0614 00:48:40.136000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf60', layout=FixedLayout('cpu', torch.float32, size=[1, 12, 9, 64, 1], stride=[6912, 576, 64, 1, 6912]), data=Reduction(
V0614 00:48:40.136000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.136000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.float32,
V0614 00:48:40.136000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index, rindex):
V0614 00:48:40.136000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3, _ = index
V0614 00:48:40.136000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       r0 = rindex
V0614 00:48:40.136000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(buf59, r0 + 512 * i3 + 32768 * i2 + 294912 * i1)
V0614 00:48:40.136000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp1 = ops.to_dtype(tmp0, torch.float32, src_dtype=torch.bfloat16)
V0614 00:48:40.136000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp1
V0614 00:48:40.136000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.136000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 9, 64, 1],
V0614 00:48:40.136000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   reduction_ranges=[512],
V0614 00:48:40.136000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   reduction_type=max,
V0614 00:48:40.136000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=amax_2,
V0614 00:48:40.136000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={amax_2, convert_element_type_27}
V0614 00:48:40.136000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf61', layout=FixedLayout('cpu', torch.float32, size=[1, 12, 9, 64, 512], stride=[3538944, 294912, 32768, 512, 1]), data=Pointwise(
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.float32,
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3, i4 = index
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(buf59, i4 + 512 * i3 + 32768 * i2 + 294912 * i1)
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp1 = ops.to_dtype(tmp0, torch.float32, src_dtype=torch.bfloat16)
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp2 = ops.load(buf60, i3 + 64 * i2 + 576 * i1)
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp3 = tmp1 - tmp2
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp4 = ops.exp(tmp3)
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp4
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 9, 64, 512],
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=exp_2,
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={exp_2, sub_8, convert_element_type_27}
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf62', layout=FixedLayout('cpu', torch.float32, size=[1, 12, 9, 64, 1], stride=[6912, 576, 64, 1, 6912]), data=Reduction(
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.float32,
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index, rindex):
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3, _ = index
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       r0 = rindex
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(buf61, r0 + 512 * i3 + 32768 * i2 + 294912 * i1)
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 9, 64, 1],
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   reduction_ranges=[512],
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   reduction_type=sum,
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=sum_3,
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={sum_3}
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf63', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3, i4 = index
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg15_1, 49152 + i4 + 64 * i1 + 768 * i3 + 49152 * i2)
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_6}
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf64', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3, i4 = index
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg15_1, 98304 + i4 + 64 * i1 + 768 * i3 + 49152 * i2)
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_6}
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf65', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3, i4 = index
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg15_1, 147456 + i4 + 64 * i1 + 768 * i3 + 49152 * i2)
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_6}
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ConcatKernel(name='buf66', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 9, 192, 64], stride=[1327104, 110592, 12288, 64, 1]), inputs=[ComputedBuffer(name='buf63', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3, i4 = index
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg15_1, 49152 + i4 + 64 * i1 + 768 * i3 + 49152 * i2)
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_6}
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf64', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3, i4 = index
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg15_1, 98304 + i4 + 64 * i1 + 768 * i3 + 49152 * i2)
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_6}
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf65', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3, i4 = index
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg15_1, 147456 + i4 + 64 * i1 + 768 * i3 + 49152 * i2)
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_6}
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))])
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf67', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 512], stride=[3538944, 294912, 32768, 512, 1]), data=Pointwise(
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3, i4 = index
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(buf61, i4 + 512 * i3 + 32768 * i2 + 294912 * i1)
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp1 = ops.load(buf62, i3 + 64 * i2 + 576 * i1)
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp2 = tmp0 / tmp1
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp3 = ops.to_dtype(tmp2, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp3
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 9, 64, 512],
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=convert_element_type_28,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={div_4, convert_element_type_28}
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ExternKernelOut(
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   python_kernel_name='extern_kernels.bmm',
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   name=buf68,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   layout=FixedLayout('cpu', torch.bfloat16, size=[108, 64, 64], stride=[4096, 64, 1]),
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   inputs=[ReinterpretView(
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     StorageBox(
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       ComputedBuffer(name='buf67', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 512], stride=[3538944, 294912, 32768, 512, 1]), data=Pointwise(
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         'cpu',
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         torch.bfloat16,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         def inner_fn(index):
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             _, i1, i2, i3, i4 = index
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp0 = ops.load(buf61, i4 + 512 * i3 + 32768 * i2 + 294912 * i1)
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp1 = ops.load(buf62, i3 + 64 * i2 + 576 * i1)
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp2 = tmp0 / tmp1
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp3 = ops.to_dtype(tmp2, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             return tmp3
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ranges=[1, 12, 9, 64, 512],
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origin_node=convert_element_type_28,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origins={div_4, convert_element_type_28}
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       ))
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     ),
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     FixedLayout('cpu', torch.bfloat16, size=[108, 64, 192], stride=[32768, 512, 1], offset=64),
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     origins={bmm_8}
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ), ReinterpretView(
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     StorageBox(
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       ConcatKernel(name='buf66', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 9, 192, 64], stride=[1327104, 110592, 12288, 64, 1]), inputs=[ComputedBuffer(name='buf63', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         'cpu',
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         torch.bfloat16,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         def inner_fn(index):
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             _, i1, i2, i3, i4 = index
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp0 = ops.load(arg15_1, 49152 + i4 + 64 * i1 + 768 * i3 + 49152 * i2)
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             return tmp0
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origin_node=None,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origins={cat_6}
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       )), ComputedBuffer(name='buf64', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         'cpu',
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         torch.bfloat16,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         def inner_fn(index):
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             _, i1, i2, i3, i4 = index
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp0 = ops.load(arg15_1, 98304 + i4 + 64 * i1 + 768 * i3 + 49152 * i2)
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             return tmp0
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origin_node=None,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origins={cat_6}
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       )), ComputedBuffer(name='buf65', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         'cpu',
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         torch.bfloat16,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         def inner_fn(index):
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             _, i1, i2, i3, i4 = index
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp0 = ops.load(arg15_1, 147456 + i4 + 64 * i1 + 768 * i3 + 49152 * i2)
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             return tmp0
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origin_node=None,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origins={cat_6}
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       ))])
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     ),
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     FixedLayout('cpu', torch.bfloat16, size=[108, 192, 64], stride=[12288, 64, 1]),
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     origins={bmm_8}
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   )],
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   constant_args=(),
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   kwargs={},
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   output_view=None,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   python_kernel_name=extern_kernels.bmm,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   cpp_kernel_name=at::bmm_out,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ordered_kwargs_for_cpp_kernel=(),
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   op_overload=None,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   arg_properties=[{}, {}],
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   kwarg_properties=None,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   unbacked_bindings={},
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=bmm_8,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={bmm_8}
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )
V0614 00:48:40.142000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf69', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 9, 192, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.142000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.142000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.142000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.142000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3, i4 = index
V0614 00:48:40.142000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(buf19, 33 * ModularIndexing(12288 + i4 + 64 * i3 + 12288 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(12288 + i4 + 64 * i3 + 12288 * i2, 4096, 33))
V0614 00:48:40.142000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp1 = ops.index_expr(33 * ModularIndexing(12288 + i4 + 64 * i3 + 12288 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(12288 + i4 + 64 * i3 + 12288 * i2, 4096, 33), dtype=torch.int64)
V0614 00:48:40.142000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp2 = ops.constant(33, torch.int64)
V0614 00:48:40.142000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp3 = tmp1 // tmp2
V0614 00:48:40.142000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp4 = ops.constant(13, torch.int64)
V0614 00:48:40.142000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp5 = tmp3 * tmp4
V0614 00:48:40.142000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp6 = tmp0 + tmp5
V0614 00:48:40.142000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp7 = ops.load(arg15_1, 64 * ModularIndexing(tmp6, 13, 12) + 768 * ModularIndexing(i4 + 64 * i3, 64, 64) + 49152 * ModularIndexing(tmp6, 1, 13) + ModularIndexing(i4, 1, 64))
V0614 00:48:40.142000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp7
V0614 00:48:40.142000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.142000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 9, 192, 64],
V0614 00:48:40.142000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=clone_7,
V0614 00:48:40.142000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={clone_7}
V0614 00:48:40.142000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ExternKernelOut(
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   python_kernel_name='extern_kernels.bmm',
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   name=buf70,
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   layout=FixedLayout('cpu', torch.bfloat16, size=[108, 64, 64], stride=[4096, 64, 1]),
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   inputs=[ReinterpretView(
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     StorageBox(
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       ComputedBuffer(name='buf67', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 512], stride=[3538944, 294912, 32768, 512, 1]), data=Pointwise(
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         'cpu',
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         torch.bfloat16,
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         def inner_fn(index):
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             _, i1, i2, i3, i4 = index
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp0 = ops.load(buf61, i4 + 512 * i3 + 32768 * i2 + 294912 * i1)
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp1 = ops.load(buf62, i3 + 64 * i2 + 576 * i1)
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp2 = tmp0 / tmp1
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp3 = ops.to_dtype(tmp2, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             return tmp3
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ,
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ranges=[1, 12, 9, 64, 512],
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origin_node=convert_element_type_28,
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origins={div_4, convert_element_type_28}
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       ))
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     ),
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     FixedLayout('cpu', torch.bfloat16, size=[108, 64, 192], stride=[32768, 512, 1], offset=256),
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     origins={bmm_9}
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ), ReinterpretView(
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     StorageBox(
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       ComputedBuffer(name='buf69', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 9, 192, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         'cpu',
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         torch.bfloat16,
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         def inner_fn(index):
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             _, i1, i2, i3, i4 = index
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp0 = ops.load(buf19, 33 * ModularIndexing(12288 + i4 + 64 * i3 + 12288 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(12288 + i4 + 64 * i3 + 12288 * i2, 4096, 33))
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp1 = ops.index_expr(33 * ModularIndexing(12288 + i4 + 64 * i3 + 12288 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(12288 + i4 + 64 * i3 + 12288 * i2, 4096, 33), dtype=torch.int64)
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp2 = ops.constant(33, torch.int64)
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp3 = tmp1 // tmp2
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp4 = ops.constant(13, torch.int64)
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp5 = tmp3 * tmp4
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp6 = tmp0 + tmp5
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp7 = ops.load(arg15_1, 64 * ModularIndexing(tmp6, 13, 12) + 768 * ModularIndexing(i4 + 64 * i3, 64, 64) + 49152 * ModularIndexing(tmp6, 1, 13) + ModularIndexing(i4, 1, 64))
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             return tmp7
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ,
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ranges=[1, 12, 9, 192, 64],
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origin_node=clone_7,
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origins={clone_7}
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       ))
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     ),
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     FixedLayout('cpu', torch.bfloat16, size=[108, 192, 64], stride=[12288, 64, 1]),
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     origins={bmm_9}
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   )],
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   constant_args=(),
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   kwargs={},
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   output_view=None,
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   python_kernel_name=extern_kernels.bmm,
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   cpp_kernel_name=at::bmm_out,
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ordered_kwargs_for_cpp_kernel=(),
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   op_overload=None,
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   arg_properties=[{}, {}],
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   kwarg_properties=None,
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   unbacked_bindings={},
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=bmm_9,
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={bmm_9}
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ExternKernelOut(
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   python_kernel_name='extern_kernels.bmm',
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   name=buf71,
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   layout=FixedLayout('cpu', torch.bfloat16, size=[12, 576, 64], stride=[36864, 64, 1]),
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   inputs=[ReinterpretView(
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     StorageBox(
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       ComputedBuffer(name='buf67', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 512], stride=[3538944, 294912, 32768, 512, 1]), data=Pointwise(
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         'cpu',
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         torch.bfloat16,
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         def inner_fn(index):
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             _, i1, i2, i3, i4 = index
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp0 = ops.load(buf61, i4 + 512 * i3 + 32768 * i2 + 294912 * i1)
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp1 = ops.load(buf62, i3 + 64 * i2 + 576 * i1)
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp2 = tmp0 / tmp1
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp3 = ops.to_dtype(tmp2, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             return tmp3
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ,
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ranges=[1, 12, 9, 64, 512],
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origin_node=convert_element_type_28,
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origins={div_4, convert_element_type_28}
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       ))
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     ),
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     FixedLayout('cpu', torch.bfloat16, size=[12, 576, 64], stride=[294912, 512, 1]),
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     origins={bmm_10}
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ), ReinterpretView(
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     StorageBox(
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       InputBuffer(name='arg15_1', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 832, 64], stride=[638976, 64, 768, 1]))
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     ),
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     FixedLayout('cpu', torch.bfloat16, size=[12, 64, 64], stride=[64, 768, 1]),
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     origins={bmm_10}
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   )],
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   constant_args=(),
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   kwargs={},
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   output_view=None,
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   python_kernel_name=extern_kernels.bmm,
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   cpp_kernel_name=at::bmm_out,
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ordered_kwargs_for_cpp_kernel=(),
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   op_overload=None,
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   arg_properties=[{}, {}],
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   kwarg_properties=None,
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   unbacked_bindings={},
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=bmm_10,
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={bmm_10}
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ExternKernelOut(
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   python_kernel_name='extern_kernels.bmm',
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   name=buf72,
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   layout=FixedLayout('cpu', torch.bfloat16, size=[12, 576, 64], stride=[36864, 64, 1]),
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   inputs=[ReinterpretView(
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     StorageBox(
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       ComputedBuffer(name='buf67', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 512], stride=[3538944, 294912, 32768, 512, 1]), data=Pointwise(
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         'cpu',
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         torch.bfloat16,
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         def inner_fn(index):
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             _, i1, i2, i3, i4 = index
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp0 = ops.load(buf61, i4 + 512 * i3 + 32768 * i2 + 294912 * i1)
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp1 = ops.load(buf62, i3 + 64 * i2 + 576 * i1)
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp2 = tmp0 / tmp1
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp3 = ops.to_dtype(tmp2, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             return tmp3
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ,
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ranges=[1, 12, 9, 64, 512],
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origin_node=convert_element_type_28,
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origins={div_4, convert_element_type_28}
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       ))
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     ),
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     FixedLayout('cpu', torch.bfloat16, size=[12, 576, 64], stride=[294912, 512, 1], offset=448),
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     origins={bmm_11}
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ), ReinterpretView(
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     StorageBox(
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       InputBuffer(name='arg15_1', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 832, 64], stride=[638976, 64, 768, 1]))
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     ),
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     FixedLayout('cpu', torch.bfloat16, size=[12, 64, 64], stride=[64, 768, 1], offset=589824),
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     origins={bmm_11}
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   )],
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   constant_args=(),
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   kwargs={},
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   output_view=None,
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   python_kernel_name=extern_kernels.bmm,
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   cpp_kernel_name=at::bmm_out,
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ordered_kwargs_for_cpp_kernel=(),
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   op_overload=None,
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   arg_properties=[{}, {}],
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   kwarg_properties=None,
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   unbacked_bindings={},
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=bmm_11,
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={bmm_11}
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf73', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg14_1, i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 64],
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_8}
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf74', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg14_1, 491520 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 64],
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_8}
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf75', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg14_1, 540672 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 64],
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_8}
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf76', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg14_1, 589824 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 64],
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_8}
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.147000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf77', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 192, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.147000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.147000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.147000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.147000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.147000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(buf19, 33 * ModularIndexing(122880 + i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(122880 + i3 + 64 * i2, 4096, 33))
V0614 00:48:40.147000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp1 = ops.index_expr(33 * ModularIndexing(122880 + i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(122880 + i3 + 64 * i2, 4096, 33), dtype=torch.int64)
V0614 00:48:40.147000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp2 = ops.constant(33, torch.int64)
V0614 00:48:40.147000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp3 = tmp1 // tmp2
V0614 00:48:40.147000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp4 = ops.constant(13, torch.int64)
V0614 00:48:40.147000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp5 = tmp3 * tmp4
V0614 00:48:40.147000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp6 = tmp0 + tmp5
V0614 00:48:40.147000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp7 = ops.load(arg14_1, 64 * ModularIndexing(tmp6, 13, 12) + 768 * ModularIndexing(i3 + 64 * i2, 64, 64) + 49152 * ModularIndexing(tmp6, 1, 13) + ModularIndexing(i3, 1, 64))
V0614 00:48:40.147000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp7
V0614 00:48:40.147000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.147000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 192, 64],
V0614 00:48:40.147000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.147000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_8}
V0614 00:48:40.147000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ConcatKernel(name='buf78', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 448, 64], stride=[344064, 28672, 64, 1]), inputs=[ComputedBuffer(name='buf73', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg14_1, i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 64],
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_8}
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf74', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg14_1, 491520 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 64],
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_8}
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf75', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg14_1, 540672 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 64],
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_8}
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf76', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg14_1, 589824 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 64],
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_8}
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf77', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 192, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(buf19, 33 * ModularIndexing(122880 + i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(122880 + i3 + 64 * i2, 4096, 33))
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp1 = ops.index_expr(33 * ModularIndexing(122880 + i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(122880 + i3 + 64 * i2, 4096, 33), dtype=torch.int64)
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp2 = ops.constant(33, torch.int64)
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp3 = tmp1 // tmp2
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp4 = ops.constant(13, torch.int64)
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp5 = tmp3 * tmp4
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp6 = tmp0 + tmp5
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp7 = ops.load(arg14_1, 64 * ModularIndexing(tmp6, 13, 12) + 768 * ModularIndexing(i3 + 64 * i2, 64, 64) + 49152 * ModularIndexing(tmp6, 1, 13) + ModularIndexing(i3, 1, 64))
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp7
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 192, 64],
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_8}
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))])
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ExternKernelOut(
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   python_kernel_name='extern_kernels.bmm',
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   name=buf79,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   layout=FixedLayout('cpu', torch.bfloat16, size=[12, 64, 448], stride=[28672, 448, 1]),
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   inputs=[ReinterpretView(
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     StorageBox(
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       InputBuffer(name='arg12_1', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 832, 64], stride=[638976, 64, 768, 1]))
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     ),
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     FixedLayout('cpu', torch.bfloat16, size=[12, 64, 64], stride=[64, 768, 1], offset=540672),
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     origins={bmm_12}
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ), ReinterpretView(
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     StorageBox(
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       ConcatKernel(name='buf78', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 448, 64], stride=[344064, 28672, 64, 1]), inputs=[ComputedBuffer(name='buf73', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         'cpu',
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         torch.bfloat16,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         def inner_fn(index):
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             _, i1, i2, i3 = index
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp0 = ops.load(arg14_1, i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             return tmp0
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ranges=[1, 12, 64, 64],
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origin_node=None,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origins={cat_8}
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       )), ComputedBuffer(name='buf74', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         'cpu',
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         torch.bfloat16,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         def inner_fn(index):
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             _, i1, i2, i3 = index
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp0 = ops.load(arg14_1, 491520 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             return tmp0
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ranges=[1, 12, 64, 64],
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origin_node=None,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origins={cat_8}
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       )), ComputedBuffer(name='buf75', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         'cpu',
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         torch.bfloat16,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         def inner_fn(index):
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             _, i1, i2, i3 = index
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp0 = ops.load(arg14_1, 540672 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             return tmp0
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ranges=[1, 12, 64, 64],
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origin_node=None,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origins={cat_8}
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       )), ComputedBuffer(name='buf76', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         'cpu',
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         torch.bfloat16,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         def inner_fn(index):
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             _, i1, i2, i3 = index
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp0 = ops.load(arg14_1, 589824 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             return tmp0
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ranges=[1, 12, 64, 64],
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origin_node=None,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origins={cat_8}
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       )), ComputedBuffer(name='buf77', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 192, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         'cpu',
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         torch.bfloat16,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         def inner_fn(index):
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             _, i1, i2, i3 = index
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp0 = ops.load(buf19, 33 * ModularIndexing(122880 + i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(122880 + i3 + 64 * i2, 4096, 33))
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp1 = ops.index_expr(33 * ModularIndexing(122880 + i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(122880 + i3 + 64 * i2, 4096, 33), dtype=torch.int64)
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp2 = ops.constant(33, torch.int64)
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp3 = tmp1 // tmp2
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp4 = ops.constant(13, torch.int64)
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp5 = tmp3 * tmp4
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp6 = tmp0 + tmp5
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp7 = ops.load(arg14_1, 64 * ModularIndexing(tmp6, 13, 12) + 768 * ModularIndexing(i3 + 64 * i2, 64, 64) + 49152 * ModularIndexing(tmp6, 1, 13) + ModularIndexing(i3, 1, 64))
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             return tmp7
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ranges=[1, 12, 192, 64],
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origin_node=None,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origins={cat_8}
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       ))])
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     ),
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     FixedLayout('cpu', torch.bfloat16, size=[12, 64, 448], stride=[28672, 1, 64]),
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     origins={bmm_12}
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   )],
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   constant_args=(),
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   kwargs={},
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   output_view=None,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   python_kernel_name=extern_kernels.bmm,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   cpp_kernel_name=at::bmm_out,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ordered_kwargs_for_cpp_kernel=(),
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   op_overload=None,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   arg_properties=[{}, {}],
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   kwarg_properties=None,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   unbacked_bindings={},
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=bmm_12,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={bmm_12}
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf80', layout=NonOwningLayout('cpu', torch.float32, size=[1, 1, 1, 64], stride=[448, 448, 448, 1]), data=Pointwise(
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.float32,
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, _, _, i3 = index
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg16_1, i3)
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 1, 1, 64],
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_10}
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf81', layout=NonOwningLayout('cpu', torch.float32, size=[1, 1, 1, 192], stride=[448, 448, 448, 1]), data=Pointwise(
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.float32,
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, _, _, i3 = index
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg16_1, 640 + i3)
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 1, 1, 192],
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_10}
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf82', layout=NonOwningLayout('cpu', torch.float32, size=[1, 1, 1, 192], stride=[448, 448, 448, 1]), data=Pointwise(
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.float32,
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, _, _, i3 = index
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.constant(1, torch.float32)
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 1, 1, 192],
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_10}
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ConcatKernel(name='buf83', layout=FixedLayout('cpu', torch.float32, size=[1, 1, 1, 448], stride=[448, 448, 448, 1]), inputs=[ComputedBuffer(name='buf80', layout=NonOwningLayout('cpu', torch.float32, size=[1, 1, 1, 64], stride=[448, 448, 448, 1]), data=Pointwise(
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.float32,
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, _, _, i3 = index
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg16_1, i3)
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 1, 1, 64],
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_10}
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf81', layout=NonOwningLayout('cpu', torch.float32, size=[1, 1, 1, 192], stride=[448, 448, 448, 1]), data=Pointwise(
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.float32,
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, _, _, i3 = index
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg16_1, 640 + i3)
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 1, 1, 192],
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_10}
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf82', layout=NonOwningLayout('cpu', torch.float32, size=[1, 1, 1, 192], stride=[448, 448, 448, 1]), data=Pointwise(
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.float32,
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, _, _, i3 = index
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.constant(1, torch.float32)
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 1, 1, 192],
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_10}
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))])
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf84', layout=NonOwningLayout('cpu', torch.float32, size=[1, 12, 64, 256], stride=[344064, 28672, 448, 1]), data=Pointwise(
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.float32,
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.constant(1, torch.float32)
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 256],
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_11}
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf85', layout=NonOwningLayout('cpu', torch.float32, size=[1, 12, 64, 192], stride=[344064, 28672, 448, 1]), data=Pointwise(
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.float32,
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg13_1, 704 + i2)
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp1 = ops.load(buf19, ModularIndexing(1920 + i3 + 2112 * i1, 64, 396))
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp2 = ops.load(arg13_1, 64 * tmp1 + ModularIndexing(i3, 1, 64))
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp3 = tmp0 * tmp2
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp3
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 192],
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_11}
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ConcatKernel(name='buf86', layout=FixedLayout('cpu', torch.float32, size=[1, 12, 64, 448], stride=[344064, 28672, 448, 1]), inputs=[ComputedBuffer(name='buf84', layout=NonOwningLayout('cpu', torch.float32, size=[1, 12, 64, 256], stride=[344064, 28672, 448, 1]), data=Pointwise(
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.float32,
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.constant(1, torch.float32)
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 256],
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_11}
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf85', layout=NonOwningLayout('cpu', torch.float32, size=[1, 12, 64, 192], stride=[344064, 28672, 448, 1]), data=Pointwise(
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.float32,
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg13_1, 704 + i2)
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp1 = ops.load(buf19, ModularIndexing(1920 + i3 + 2112 * i1, 64, 396))
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp2 = ops.load(arg13_1, 64 * tmp1 + ModularIndexing(i3, 1, 64))
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp3 = tmp0 * tmp2
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp3
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 192],
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_11}
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))])
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf87', layout=FixedLayout('cpu', torch.float32, size=[1, 12, 64, 1], stride=[768, 64, 1, 768]), data=Reduction(
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.float32,
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index, rindex):
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, _ = index
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       r0 = rindex
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(buf79, r0 + 448 * i2 + 28672 * i1)
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp1 = ops.constant(0.125, torch.bfloat16)
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp2 = tmp0 * tmp1
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp3 = ops.to_dtype(tmp2, torch.float32, src_dtype=torch.bfloat16)
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp4 = ops.constant(1.0, torch.float32)
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp5 = ops.load(buf83, r0)
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp6 = ops.load(buf86, r0 + 448 * i2 + 28672 * i1)
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp7 = ops.minimum(tmp5, tmp6)
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp8 = tmp4 - tmp7
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp9 = ops.constant(-10000.0, torch.float32)
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp10 = tmp8 * tmp9
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp11 = tmp3 + tmp10
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp11
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 1],
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   reduction_ranges=[448],
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   reduction_type=max,
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=amax_3,
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={mul_16, sub_9, mul_15, add_11, amax_3, minimum_1}
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf88', layout=FixedLayout('cpu', torch.float32, size=[1, 12, 64, 448], stride=[344064, 28672, 448, 1]), data=Pointwise(
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.float32,
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(buf79, i3 + 448 * i2 + 28672 * i1)
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp1 = ops.constant(0.125, torch.bfloat16)
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp2 = tmp0 * tmp1
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp3 = ops.to_dtype(tmp2, torch.float32, src_dtype=torch.bfloat16)
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp4 = ops.constant(1.0, torch.float32)
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp5 = ops.load(buf83, i3)
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp6 = ops.load(buf86, i3 + 448 * i2 + 28672 * i1)
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp7 = ops.minimum(tmp5, tmp6)
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp8 = tmp4 - tmp7
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp9 = ops.constant(-10000.0, torch.float32)
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp10 = tmp8 * tmp9
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp11 = tmp3 + tmp10
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp12 = ops.load(buf87, i2 + 64 * i1)
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp13 = tmp11 - tmp12
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp14 = ops.exp(tmp13)
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp14
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 448],
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=exp_3,
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={exp_3, mul_16, sub_9, mul_15, sub_10, add_11, minimu...
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf89', layout=FixedLayout('cpu', torch.float32, size=[1, 12, 64, 1], stride=[768, 64, 1, 768]), data=Reduction(
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.float32,
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index, rindex):
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, _ = index
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       r0 = rindex
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(buf88, r0 + 448 * i2 + 28672 * i1)
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 1],
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   reduction_ranges=[448],
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   reduction_type=sum,
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=sum_4,
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={sum_4}
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf90', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg15_1, i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 64],
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_9}
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf91', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg15_1, 491520 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 64],
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_9}
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf92', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg15_1, 540672 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 64],
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_9}
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf93', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg15_1, 589824 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 64],
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_9}
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.163000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf94', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 192, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.163000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.163000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.163000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.163000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.163000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(buf19, 33 * ModularIndexing(122880 + i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(122880 + i3 + 64 * i2, 4096, 33))
V0614 00:48:40.163000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp1 = ops.index_expr(33 * ModularIndexing(122880 + i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(122880 + i3 + 64 * i2, 4096, 33), dtype=torch.int64)
V0614 00:48:40.163000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp2 = ops.constant(33, torch.int64)
V0614 00:48:40.163000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp3 = tmp1 // tmp2
V0614 00:48:40.163000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp4 = ops.constant(13, torch.int64)
V0614 00:48:40.163000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp5 = tmp3 * tmp4
V0614 00:48:40.163000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp6 = tmp0 + tmp5
V0614 00:48:40.163000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp7 = ops.load(arg15_1, 64 * ModularIndexing(tmp6, 13, 12) + 768 * ModularIndexing(i3 + 64 * i2, 64, 64) + 49152 * ModularIndexing(tmp6, 1, 13) + ModularIndexing(i3, 1, 64))
V0614 00:48:40.163000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp7
V0614 00:48:40.163000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.163000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 192, 64],
V0614 00:48:40.163000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.163000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_9}
V0614 00:48:40.163000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ConcatKernel(name='buf95', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 448, 64], stride=[344064, 28672, 64, 1]), inputs=[ComputedBuffer(name='buf90', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg15_1, i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 64],
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_9}
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf91', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg15_1, 491520 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 64],
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_9}
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf92', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg15_1, 540672 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 64],
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_9}
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf93', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(arg15_1, 589824 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 64],
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_9}
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf94', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 192, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(buf19, 33 * ModularIndexing(122880 + i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(122880 + i3 + 64 * i2, 4096, 33))
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp1 = ops.index_expr(33 * ModularIndexing(122880 + i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(122880 + i3 + 64 * i2, 4096, 33), dtype=torch.int64)
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp2 = ops.constant(33, torch.int64)
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp3 = tmp1 // tmp2
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp4 = ops.constant(13, torch.int64)
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp5 = tmp3 * tmp4
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp6 = tmp0 + tmp5
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp7 = ops.load(arg15_1, 64 * ModularIndexing(tmp6, 13, 12) + 768 * ModularIndexing(i3 + 64 * i2, 64, 64) + 49152 * ModularIndexing(tmp6, 1, 13) + ModularIndexing(i3, 1, 64))
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp7
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 192, 64],
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_9}
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))])
V0614 00:48:40.165000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf96', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 64, 448], stride=[344064, 28672, 448, 1]), data=Pointwise(
V0614 00:48:40.165000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.165000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.165000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.165000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.165000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(buf88, i3 + 448 * i2 + 28672 * i1)
V0614 00:48:40.165000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp1 = ops.load(buf89, i2 + 64 * i1)
V0614 00:48:40.165000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp2 = tmp0 / tmp1
V0614 00:48:40.165000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp3 = ops.to_dtype(tmp2, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:40.165000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp3
V0614 00:48:40.165000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.165000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 448],
V0614 00:48:40.165000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=convert_element_type_41,
V0614 00:48:40.165000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={div_5, convert_element_type_41}
V0614 00:48:40.165000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ExternKernelOut(
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   python_kernel_name='extern_kernels.bmm',
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   name=buf97,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   layout=FixedLayout('cpu', torch.bfloat16, size=[12, 64, 64], stride=[4096, 64, 1]),
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   inputs=[ReinterpretView(
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     StorageBox(
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       ComputedBuffer(name='buf96', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 64, 448], stride=[344064, 28672, 448, 1]), data=Pointwise(
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         'cpu',
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         torch.bfloat16,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         def inner_fn(index):
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             _, i1, i2, i3 = index
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp0 = ops.load(buf88, i3 + 448 * i2 + 28672 * i1)
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp1 = ops.load(buf89, i2 + 64 * i1)
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp2 = tmp0 / tmp1
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp3 = ops.to_dtype(tmp2, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             return tmp3
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ranges=[1, 12, 64, 448],
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origin_node=convert_element_type_41,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origins={div_5, convert_element_type_41}
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       ))
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     ),
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     FixedLayout('cpu', torch.bfloat16, size=[12, 64, 448], stride=[28672, 448, 1]),
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     origins={bmm_13}
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ), ReinterpretView(
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     StorageBox(
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       ConcatKernel(name='buf95', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 448, 64], stride=[344064, 28672, 64, 1]), inputs=[ComputedBuffer(name='buf90', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         'cpu',
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         torch.bfloat16,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         def inner_fn(index):
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             _, i1, i2, i3 = index
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp0 = ops.load(arg15_1, i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             return tmp0
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ranges=[1, 12, 64, 64],
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origin_node=None,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origins={cat_9}
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       )), ComputedBuffer(name='buf91', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         'cpu',
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         torch.bfloat16,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         def inner_fn(index):
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             _, i1, i2, i3 = index
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp0 = ops.load(arg15_1, 491520 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             return tmp0
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ranges=[1, 12, 64, 64],
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origin_node=None,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origins={cat_9}
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       )), ComputedBuffer(name='buf92', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         'cpu',
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         torch.bfloat16,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         def inner_fn(index):
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             _, i1, i2, i3 = index
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp0 = ops.load(arg15_1, 540672 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             return tmp0
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ranges=[1, 12, 64, 64],
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origin_node=None,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origins={cat_9}
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       )), ComputedBuffer(name='buf93', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         'cpu',
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         torch.bfloat16,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         def inner_fn(index):
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             _, i1, i2, i3 = index
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp0 = ops.load(arg15_1, 589824 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             return tmp0
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ranges=[1, 12, 64, 64],
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origin_node=None,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origins={cat_9}
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       )), ComputedBuffer(name='buf94', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 192, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         'cpu',
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         torch.bfloat16,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         def inner_fn(index):
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             _, i1, i2, i3 = index
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp0 = ops.load(buf19, 33 * ModularIndexing(122880 + i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(122880 + i3 + 64 * i2, 4096, 33))
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp1 = ops.index_expr(33 * ModularIndexing(122880 + i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(122880 + i3 + 64 * i2, 4096, 33), dtype=torch.int64)
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp2 = ops.constant(33, torch.int64)
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp3 = tmp1 // tmp2
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp4 = ops.constant(13, torch.int64)
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp5 = tmp3 * tmp4
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp6 = tmp0 + tmp5
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp7 = ops.load(arg15_1, 64 * ModularIndexing(tmp6, 13, 12) + 768 * ModularIndexing(i3 + 64 * i2, 64, 64) + 49152 * ModularIndexing(tmp6, 1, 13) + ModularIndexing(i3, 1, 64))
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             return tmp7
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ranges=[1, 12, 192, 64],
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origin_node=None,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origins={cat_9}
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       ))])
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     ),
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     FixedLayout('cpu', torch.bfloat16, size=[12, 448, 64], stride=[28672, 64, 1]),
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     origins={bmm_13}
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   )],
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   constant_args=(),
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   kwargs={},
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   output_view=None,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   python_kernel_name=extern_kernels.bmm,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   cpp_kernel_name=at::bmm_out,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ordered_kwargs_for_cpp_kernel=(),
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   op_overload=None,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   arg_properties=[{}, {}],
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   kwarg_properties=None,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   unbacked_bindings={},
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=bmm_13,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={bmm_13}
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ExternKernelOut(
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   python_kernel_name='extern_kernels.bmm',
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   name=buf98,
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   layout=FixedLayout('cpu', torch.bfloat16, size=[12, 64, 832], stride=[53248, 832, 1]),
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   inputs=[ReinterpretView(
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     StorageBox(
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       InputBuffer(name='arg12_1', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 832, 64], stride=[638976, 64, 768, 1]))
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     ),
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     FixedLayout('cpu', torch.bfloat16, size=[12, 64, 64], stride=[64, 768, 1], offset=589824),
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     origins={bmm_14}
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ), ReinterpretView(
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     StorageBox(
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       InputBuffer(name='arg14_1', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 832, 64], stride=[638976, 64, 768, 1]))
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     ),
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     FixedLayout('cpu', torch.bfloat16, size=[12, 64, 832], stride=[64, 1, 768]),
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     origins={bmm_14}
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   )],
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   constant_args=(),
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   kwargs={},
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   output_view=None,
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   python_kernel_name=extern_kernels.bmm,
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   cpp_kernel_name=at::bmm_out,
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ordered_kwargs_for_cpp_kernel=(),
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   op_overload=None,
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   arg_properties=[{}, {}],
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   kwarg_properties=None,
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   unbacked_bindings={},
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=bmm_14,
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={bmm_14}
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf99', layout=FixedLayout('cpu', torch.float32, size=[1, 12, 64, 1], stride=[768, 64, 1, 768]), data=Reduction(
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.float32,
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index, rindex):
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, _ = index
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       r0 = rindex
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(buf98, r0 + 832 * i2 + 53248 * i1)
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp1 = ops.constant(0.125, torch.bfloat16)
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp2 = tmp0 * tmp1
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp3 = ops.to_dtype(tmp2, torch.float32, src_dtype=torch.bfloat16)
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp4 = ops.constant(1.0, torch.float32)
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp5 = ops.load(arg16_1, r0)
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp6 = tmp4 - tmp5
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp7 = ops.constant(-10000.0, torch.float32)
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp8 = tmp6 * tmp7
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp9 = tmp3 + tmp8
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp9
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 1],
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   reduction_ranges=[832],
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   reduction_type=max,
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=amax_4,
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={mul_17, add_12, mul_4, amax_4, sub}
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf100', layout=FixedLayout('cpu', torch.float32, size=[1, 12, 64, 832], stride=[638976, 53248, 832, 1]), data=Pointwise(
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.float32,
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(buf98, i3 + 832 * i2 + 53248 * i1)
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp1 = ops.constant(0.125, torch.bfloat16)
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp2 = tmp0 * tmp1
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp3 = ops.to_dtype(tmp2, torch.float32, src_dtype=torch.bfloat16)
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp4 = ops.constant(1.0, torch.float32)
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp5 = ops.load(arg16_1, i3)
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp6 = tmp4 - tmp5
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp7 = ops.constant(-10000.0, torch.float32)
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp8 = tmp6 * tmp7
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp9 = tmp3 + tmp8
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp10 = ops.load(buf99, i2 + 64 * i1)
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp11 = tmp9 - tmp10
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp12 = ops.exp(tmp11)
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp12
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 832],
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=exp_4,
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={exp_4, mul_17, add_12, mul_4, sub, sub_12}
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf101', layout=FixedLayout('cpu', torch.float32, size=[1, 12, 64, 1], stride=[768, 64, 1, 768]), data=Reduction(
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.float32,
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index, rindex):
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, _ = index
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       r0 = rindex
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(buf100, r0 + 832 * i2 + 53248 * i1)
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 1],
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   reduction_ranges=[832],
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   reduction_type=sum,
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=sum_5,
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={sum_5}
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf102', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 64, 832], stride=[638976, 53248, 832, 1]), data=Pointwise(
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3 = index
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(buf100, i3 + 832 * i2 + 53248 * i1)
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp1 = ops.load(buf101, i2 + 64 * i1)
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp2 = tmp0 / tmp1
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp3 = ops.to_dtype(tmp2, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp3
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 64, 832],
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=convert_element_type_48,
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={div_6, convert_element_type_48}
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ExternKernelOut(
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   python_kernel_name='extern_kernels.bmm',
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   name=buf103,
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   layout=FixedLayout('cpu', torch.bfloat16, size=[12, 64, 64], stride=[4096, 64, 1]),
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   inputs=[ReinterpretView(
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     StorageBox(
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       ComputedBuffer(name='buf102', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 64, 832], stride=[638976, 53248, 832, 1]), data=Pointwise(
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         'cpu',
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         torch.bfloat16,
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         def inner_fn(index):
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             _, i1, i2, i3 = index
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp0 = ops.load(buf100, i3 + 832 * i2 + 53248 * i1)
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp1 = ops.load(buf101, i2 + 64 * i1)
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp2 = tmp0 / tmp1
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             tmp3 = ops.to_dtype(tmp2, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]             return tmp3
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ,
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         ranges=[1, 12, 64, 832],
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origin_node=convert_element_type_48,
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]         origins={div_6, convert_element_type_48}
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       ))
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     ),
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     FixedLayout('cpu', torch.bfloat16, size=[12, 64, 832], stride=[53248, 832, 1]),
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     origins={bmm_15}
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ), ReinterpretView(
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     StorageBox(
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       InputBuffer(name='arg15_1', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 832, 64], stride=[638976, 64, 768, 1]))
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     ),
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     FixedLayout('cpu', torch.bfloat16, size=[12, 832, 64], stride=[64, 768, 1]),
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]     origins={bmm_15}
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   )],
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   constant_args=(),
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   kwargs={},
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   output_view=None,
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   python_kernel_name=extern_kernels.bmm,
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   cpp_kernel_name=at::bmm_out,
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ordered_kwargs_for_cpp_kernel=(),
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   op_overload=None,
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   arg_properties=[{}, {}],
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   kwarg_properties=None,
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   unbacked_bindings={},
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=bmm_15,
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={bmm_15}
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf104', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 1, 64, 64], stride=[638976, 53248, 4096, 64, 1]), data=Pointwise(
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, _, i3, i4 = index
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(buf5, i4 + 64 * i3 + 4096 * i1)
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 1, 64, 64],
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_12}
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf105', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 1, 64, 64], stride=[638976, 53248, 4096, 64, 1]), data=Pointwise(
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, _, i3, i4 = index
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(buf44, i4 + 64 * i3 + 4096 * i1)
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp0
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ,
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   ranges=[1, 12, 1, 64, 64],
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origin_node=None,
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   origins={cat_12}
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf106', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[638976, 53248, 4096, 64, 1]), data=Pointwise(
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   'cpu',
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   torch.bfloat16,
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]   def inner_fn(index):
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       _, i1, i2, i3, i4 = index
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp0 = ops.load(buf68, i4 + 64 * i3 + 4096 * ModularIndexing(ModularIndexing(ModularIndexing(i2, 1, 9), 1, 9), 1, 9) + 36864 * ModularIndexing(9 * ModularIndexing(9 * ModularIndexing(i2 + 9 * i1, 9, 12) + ModularIndexing(i2, 1, 9), 9, 12) + ModularIndexing(ModularIndexing(i2, 1, 9), 1, 9), 9, 12))
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp1 = ops.load(buf70, i4 + 64 * i3 + 4096 * ModularIndexing(ModularIndexing(ModularIndexing(i2, 1, 9), 1, 9), 1, 9) + 36864 * ModularIndexing(9 * ModularIndexing(9 * ModularIndexing(i2 + 9 * i1, 9, 12) + ModularIndexing(i2, 1, 9), 9, 12) + ModularIndexing(ModularIndexing(i2, 1, 9), 1, 9), 9, 12))
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp2 = tmp0 + tmp1
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp3 = ops.load(buf71, i4 + 64 * i3 + 4096 * ModularIndexing(ModularIndexing(i2, 1, 9), 1, 9) + 36864 * ModularIndexing(9 * ModularIndexing(i2 + 9 * i1, 9, 12) + ModularIndexing(i2, 1, 9), 9, 12))
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp4 = tmp2 + tmp3
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp5 = ops.load(buf72, i4 + 64 * i3 + 4096 * ModularIndexing(i2, 1, 9) + 36864 * ModularIndexing(i2 + 9 * i1, 9, 12))
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       tmp6 = tmp4 + tmp5
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0]       return tmp6
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:160

## before_regression.log

      
    Raw
  

              before_regression.log
            
          
            View raw
              (Sorry about that, but we can’t show files that are this big right now.)