ezyang/gist:4379d89528bd61a0fdb31062e402fb63 Secret

## gistfile0.txt
Running torchbench.py BERT_pytorch...
ERROR:common:
Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1122, in check_accuracy
    new_result = optimized_model_iter_fn(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1020, in run_n_iterations
    self.model_iter_fn(mod, inputs, collect_outputs=False)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 332, in forward_and_backward_pass
    cloned_inputs = clone_inputs(inputs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 335, in <graph break in forward_and_backward_pass>
    pred = mod(*cloned_inputs)
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/torchbenchmark/torchbenchmark/models/BERT_pytorch/bert_pytorch/model/bert.py", line 43, in forward
    x = self.embedding(x, segment_info)
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/torchbenchmark/torchbenchmark/models/BERT_pytorch/bert_pytorch/model/embedding/bert.py", line 32, in forward
    x = self.token(sequence) + self.position(sequence) + self.segment(segment_label)
  File "/scratch/ezyang/work/torchbenchmark/torchbenchmark/models/BERT_pytorch/bert_pytorch/model/embedding/bert.py", line 32, in <graph break in forward>
    x = self.token(sequence) + self.position(sequence) + self.segment(segment_label)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 954, in forward
    return compiled_f(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 940, in new_func
    compiled_fn = create_aot_dispatcher_function(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 660, in create_aot_dispatcher_function
    aot_dispatch_autograd(flat_fn, fake_flat_tensor_args, aot_config)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 516, in aot_dispatch_autograd
    compiled_fw_func = aot_config.fw_compiler(fw_module, deduped_flat_args)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 87, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 351, in fw_compiler
    return compile_fx_inner(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/debug_utils.py", line 444, in debug_wrapper
    compiled_fn = compiler_fn(gm, example_inputs, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/debug.py", line 177, in inner
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/env/lib/python3.9/contextlib.py", line 79, in inner
    return func(*args, **kwds)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 122, in compile_fx_inner
    compiled_fn = graph.compile_to_fn()
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 349, in compile_to_fn
    return self.compile_to_module().call
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 87, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 335, in compile_to_module
    code = self.codegen()
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 326, in codegen
    self.wrapper_code = WrapperCodeGen()
  File "/scratch/ezyang/work/pytorch/torch/_inductor/codegen/wrapper.py", line 240, in __init__
    V.graph.sizevars.codegen(self.prefix, V.graph.graph_inputs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/sizevars.py", line 481, in codegen
    assert not needed
AssertionError
TorchDynamo optimized model failed to run because of following error
cuda train BERT_pytorch                       FAIL
Running torchbench.py Background_Matting...
[2022-11-06 02:49:25,066] torch._inductor.graph: [WARNING] Creating implicit fallback for:
  target: <built-in function sub>
  args[0]: 128
  args[1]: 1
ERROR:common:TypeError: sub expected 2 arguments, got 0
  target: <built-in function sub>
  args[0]: 128
  args[1]: 1

While executing %sub_36 : [#users=1] = call_function[target=operator.sub](args = (%sym_size, 1), kwargs = {})
Original traceback:
None
Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 239, in call_function
    out = lowerings[target](*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/lowering.py", line 211, in wrapped
    return decomp_fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/lowering.py", line 968, in handler
    result = ir.FallbackKernel.create(kernel, *args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/ir.py", line 2918, in create
    ) = cls.process_kernel(kernel, *args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/ir.py", line 2297, in process_kernel
    example_output = kernel(*new_args, **new_kwargs)
TypeError: sub expected 2 arguments, got 0

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1122, in check_accuracy
    new_result = optimized_model_iter_fn(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1020, in run_n_iterations
    self.model_iter_fn(mod, inputs, collect_outputs=False)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 332, in forward_and_backward_pass
    cloned_inputs = clone_inputs(inputs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 335, in <graph break in forward_and_backward_pass>
    pred = mod(*cloned_inputs)
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/torchbenchmark/torchbenchmark/models/Background_Matting/networks.py", line 91, in forward
    def forward(self, image,back,seg,multi):
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 954, in forward
    return compiled_f(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 940, in new_func
    compiled_fn = create_aot_dispatcher_function(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 660, in create_aot_dispatcher_function
    aot_dispatch_autograd(flat_fn, fake_flat_tensor_args, aot_config)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 516, in aot_dispatch_autograd
    compiled_fw_func = aot_config.fw_compiler(fw_module, deduped_flat_args)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 87, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 351, in fw_compiler
    return compile_fx_inner(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/debug_utils.py", line 444, in debug_wrapper
    compiled_fn = compiler_fn(gm, example_inputs, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/debug.py", line 177, in inner
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/env/lib/python3.9/contextlib.py", line 79, in inner
    return func(*args, **kwds)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 121, in compile_fx_inner
    graph.run(*example_inputs)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 87, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 129, in run
    return super().run(*args)
  File "/scratch/ezyang/work/pytorch/torch/fx/interpreter.py", line 130, in run
    self.env[node] = self.run_node(node)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 299, in run_node
    result = super().run_node(n)
  File "/scratch/ezyang/work/pytorch/torch/fx/interpreter.py", line 171, in run_node
    return getattr(self, n.op)(n.target, args, kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 242, in call_function
    raise LoweringException(e, target, args, kwargs) from e
torch._inductor.exc.LoweringException: TypeError: sub expected 2 arguments, got 0
  target: <built-in function sub>
  args[0]: 128
  args[1]: 1

While executing %sub_36 : [#users=1] = call_function[target=operator.sub](args = (%sym_size, 1), kwargs = {})
Original traceback:
None
TorchDynamo optimized model failed to run because of following error
cuda train Background_Matting                 FAIL

sampling loop time step:   0%|          | 0/1 [00:00<?, ?it/s]
sampling loop time step: 100%|██████████| 1/1 [00:00<00:00, 66.35it/s]

0it [00:00, ?it/s]

sampling loop time step:   0%|          | 0/1 [00:00<?, ?it/s]

sampling loop time step: 100%|██████████| 1/1 [00:01<00:00,  1.21s/it]
sampling loop time step: 100%|██████████| 1/1 [00:01<00:00,  1.21s/it]

1it [00:02,  2.03s/it]

sampling loop time step:   0%|          | 0/1 [00:00<?, ?it/s]

sampling loop time step: 100%|██████████| 1/1 [00:00<00:00,  5.75it/s]
sampling loop time step: 100%|██████████| 1/1 [00:00<00:00,  5.75it/s]

2it [00:03,  1.41s/it]
2it [00:03,  1.50s/it]
WARNING:root:DALLE2_pytorch failed to load
Eager model failed to run
Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 978, in validate_model
    self.model_iter_fn(model, example_inputs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 337, in forward_and_backward_pass
    self.grad_scaler.scale(loss).backward()
  File "/scratch/ezyang/work/pytorch/torch/_tensor.py", line 450, in backward
    torch.autograd.backward(
  File "/scratch/ezyang/work/pytorch/torch/autograd/__init__.py", line 197, in backward
    Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1877, in run
    device, name, model, example_inputs, batch_size = runner.load_model(
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 282, in load_model
    self.validate_model(model, example_inputs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 980, in validate_model
    raise NotImplementedError("Eager model failed to run")
NotImplementedError: Eager model failed to run

Running torchbench.py LearningToPaint...
ERROR:common:name 's0' is not defined
Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1122, in check_accuracy
    new_result = optimized_model_iter_fn(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1020, in run_n_iterations
    self.model_iter_fn(mod, inputs, collect_outputs=False)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 332, in forward_and_backward_pass
    cloned_inputs = clone_inputs(inputs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 335, in <graph break in forward_and_backward_pass>
    pred = mod(*cloned_inputs)
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/torchbenchmark/torchbenchmark/models/LearningToPaint/baseline/DRL/actor.py", line 104, in forward
    def forward(self, x):
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 954, in forward
    return compiled_f(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 945, in new_func
    return compiled_fn(args)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 321, in g
    return f(*args)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 573, in compiled_function
    return CompiledFunction.apply(*remove_dupe_args(args))
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 528, in forward
    fw_outs = call_func_with_args(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 346, in call_func_with_args
    out = normalize_as_list(f(args))
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 185, in run
    return model(new_inputs)
  File "/tmp/torchinductor_ezyang/77/c77macsht3xuqk2mqrn3rco3ortepqqfzdokmrlcme5oggulu75o.py", line 744, in call
    return (buf44, primals_1, primals_2, primals_4, primals_5, primals_7, primals_8, primals_10, primals_11, primals_13, primals_14, primals_16, primals_17, primals_19, primals_20, primals_22, primals_23, primals_25, primals_26, primals_28, primals_29, primals_31, primals_32, primals_34, primals_35, primals_37, primals_38, primals_40, primals_41, primals_43, primals_44, primals_46, primals_47, primals_49, primals_50, primals_52, primals_53, primals_55, primals_56, primals_58, primals_59, primals_61, primals_62, primals_66, primals_67, primals_69, primals_70, primals_72, primals_73, primals_75, primals_76, primals_78, primals_79, primals_81, primals_82, primals_84, primals_85, primals_87, primals_88, primals_90, primals_91, primals_93, primals_94, primals_96, primals_97, primals_99, primals_100, primals_102, primals_103, primals_105, primals_106, primals_108, primals_109, primals_111, primals_112, primals_114, primals_115, primals_117, primals_118, primals_120, primals_121, primals_123, primals_124, primals_126, primals_127, primals_129, buf0, buf1, buf2, buf3, buf4, buf5, buf7, buf8, buf9, buf10, buf11, buf12, buf13, buf14, buf15, buf17, buf18, buf19, buf20, buf21, buf22, buf23, buf24, buf25, buf27, buf28, buf29, buf30, buf31, buf32, buf33, buf34, buf35, buf37, buf38, buf39, buf40, buf41, buf42, buf44, as_strided(primals_64, (65, 512), (512, 1)), s0, )
NameError: name 's0' is not defined
TorchDynamo optimized model failed to run because of following error
cuda train LearningToPaint                    FAIL
Running torchbench.py Super_SloMo...
ERROR:common:Failed running call_function <function grid_sample at 0x7fa17a2ae040>(*(FakeTensor(FakeTensor(..., device='meta', size=(s0, s1, s2, s2)), cuda:0), FakeTensor(FakeTensor(..., device='meta',
           size=(s0, -127.0*s1 + 32.0*(-2*s1 + (-2*s1 + (-2*s1 + (-2*s7 + (s2 - 2*s5 + 12)//2 + 9)//2 + 5)//2 + 5)//2 + 5)//2 + 413.0, -127.0*s1 + 32.0*(-2*s1 + (-2*s1 + (-2*s1 + (-2*s7 + (s2 - 2*s5 + 12)//2 + 9)//2 + 5)//2 + 5)//2 + 5)//2 + 413.0, 2),
           grad_fn=<StackBackward0>), cuda:0)), **{}):
TypeError: cannot determine truth value of Relational

At:
  /scratch/ezyang/work/env/lib/python3.9/site-packages/sympy/core/relational.py(511): __bool__
  /scratch/ezyang/work/pytorch/torch/fx/experimental/symbolic_shapes.py(203): bool_
  /scratch/ezyang/work/pytorch/torch/nn/functional.py(4239): grid_sample
  /scratch/ezyang/work/pytorch/torch/_dynamo/variables/tensor.py(52): _run_node
  /scratch/ezyang/work/pytorch/torch/_dynamo/variables/tensor.py(131): <lambda>
  /scratch/ezyang/work/pytorch/torch/_dynamo/utils.py(709): wrap_fake_exception
  /scratch/ezyang/work/pytorch/torch/_dynamo/variables/tensor.py(130): _get_fake_value
  /scratch/ezyang/work/pytorch/torch/_dynamo/variables/tensor.py(199): create
  /scratch/ezyang/work/pytorch/torch/_dynamo/variables/torch.py(408): call_function
  /scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py(271): call_function
  /scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py(786): CALL_FUNCTION
  /scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py(181): wrapper
  /scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py(329): step
  /scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py(359): run
  /scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py(1620): inline_call_
  /scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py(1566): inline_call
  /scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py(300): inline_user_function_return
  /scratch/ezyang/work/pytorch/torch/_dynamo/variables/nn_module.py(221): call_function
  /scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py(271): call_function
  /scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py(786): CALL_FUNCTION
  /scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py(181): wrapper
  /scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py(329): step
  /scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py(359): run
  /scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py(1494): run
  /scratch/ezyang/work/pytorch/torch/_dynamo/convert_frame.py(384): transform
  /scratch/ezyang/work/pytorch/torch/_dynamo/bytecode_transformation.py(341): transform_code_object
  /scratch/ezyang/work/pytorch/torch/_dynamo/convert_frame.py(396): _compile
  /scratch/ezyang/work/pytorch/torch/_dynamo/convert_frame.py(341): _convert_frame_assert
  /scratch/ezyang/work/pytorch/torch/_dynamo/utils.py(87): time_wrapper
  /scratch/ezyang/work/pytorch/torch/_dynamo/convert_frame.py(112): _fn
  /scratch/ezyang/work/pytorch/torch/_dynamo/convert_frame.py(459): _convert_frame
  /scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py(251): catch_errors
  /scratch/ezyang/work/pytorch/torch/nn/modules/module.py(1423): _call_impl
  /scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py(335): <graph break in forward_and_backward_pass>
  /scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py(332): forward_and_backward_pass
  /scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py(1020): run_n_iterations
  /scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py(173): _fn
  /scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py(1122): check_accuracy
  /scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py(1297): run_one_model
  /scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py(1906): run
  /scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py(775): inner
  /scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py(1580): main
  /scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py(349): <module>

(scroll up for backtrace)
Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/variables/tensor.py", line 52, in _run_node
    return node.target(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/nn/functional.py", line 4239, in grid_sample
    return torch.grid_sampler(input, grid, mode_enum, padding_mode_enum, align_corners)
RuntimeError: TypeError: cannot determine truth value of Relational

At:
  /scratch/ezyang/work/env/lib/python3.9/site-packages/sympy/core/relational.py(511): __bool__
  /scratch/ezyang/work/pytorch/torch/fx/experimental/symbolic_shapes.py(203): bool_
  /scratch/ezyang/work/pytorch/torch/nn/functional.py(4239): grid_sample
  /scratch/ezyang/work/pytorch/torch/_dynamo/variables/tensor.py(52): _run_node
  /scratch/ezyang/work/pytorch/torch/_dynamo/variables/tensor.py(131): <lambda>
  /scratch/ezyang/work/pytorch/torch/_dynamo/utils.py(709): wrap_fake_exception
  /scratch/ezyang/work/pytorch/torch/_dynamo/variables/tensor.py(130): _get_fake_value
  /scratch/ezyang/work/pytorch/torch/_dynamo/variables/tensor.py(199): create
  /scratch/ezyang/work/pytorch/torch/_dynamo/variables/torch.py(408): call_function
  /scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py(271): call_function
  /scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py(786): CALL_FUNCTION
  /scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py(181): wrapper
  /scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py(329): step
  /scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py(359): run
  /scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py(1620): inline_call_
  /scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py(1566): inline_call
  /scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py(300): inline_user_function_return
  /scratch/ezyang/work/pytorch/torch/_dynamo/variables/nn_module.py(221): call_function
  /scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py(271): call_function
  /scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py(786): CALL_FUNCTION
  /scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py(181): wrapper
  /scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py(329): step
  /scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py(359): run
  /scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py(1494): run
  /scratch/ezyang/work/pytorch/torch/_dynamo/convert_frame.py(384): transform
  /scratch/ezyang/work/pytorch/torch/_dynamo/bytecode_transformation.py(341): transform_code_object
  /scratch/ezyang/work/pytorch/torch/_dynamo/convert_frame.py(396): _compile
  /scratch/ezyang/work/pytorch/torch/_dynamo/convert_frame.py(341): _convert_frame_assert
  /scratch/ezyang/work/pytorch/torch/_dynamo/utils.py(87): time_wrapper
  /scratch/ezyang/work/pytorch/torch/_dynamo/convert_frame.py(112): _fn
  /scratch/ezyang/work/pytorch/torch/_dynamo/convert_frame.py(459): _convert_frame
  /scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py(251): catch_errors
  /scratch/ezyang/work/pytorch/torch/nn/modules/module.py(1423): _call_impl
  /scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py(335): <graph break in forward_and_backward_pass>
  /scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py(332): forward_and_backward_pass
  /scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py(1020): run_n_iterations
  /scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py(173): _fn
  /scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py(1122): check_accuracy
  /scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py(1297): run_one_model
  /scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py(1906): run
  /scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py(775): inner
  /scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py(1580): main
  /scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py(349): <module>


The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1122, in check_accuracy
    new_result = optimized_model_iter_fn(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1020, in run_n_iterations
    self.model_iter_fn(mod, inputs, collect_outputs=False)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 332, in forward_and_backward_pass
    cloned_inputs = clone_inputs(inputs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 335, in <graph break in forward_and_backward_pass>
    pred = mod(*cloned_inputs)
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 251, in catch_errors
    return callback(frame, cache_size)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/convert_frame.py", line 459, in _convert_frame
    result = inner_convert(frame, cache_size)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/convert_frame.py", line 112, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 87, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/convert_frame.py", line 341, in _convert_frame_assert
    return _compile(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/convert_frame.py", line 396, in _compile
    out_code = transform_code_object(code, transform)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/bytecode_transformation.py", line 341, in transform_code_object
    transformations(instructions, code_options)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/convert_frame.py", line 384, in transform
    tracer.run()
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py", line 1494, in run
    super().run()
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py", line 359, in run
    and self.step()
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py", line 329, in step
    getattr(self, inst.opname)(inst)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py", line 181, in wrapper
    return inner_fn(self, inst)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py", line 786, in CALL_FUNCTION
    self.call_function(fn, args, {})
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py", line 271, in call_function
    self.push(fn.call_function(self, args, kwargs))
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/variables/nn_module.py", line 221, in call_function
    return tx.inline_user_function_return(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py", line 300, in inline_user_function_return
    result = InliningInstructionTranslator.inline_call(self, fn, args, kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py", line 1566, in inline_call
    return cls.inline_call_(parent, func, args, kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py", line 1620, in inline_call_
    tracer.run()
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py", line 359, in run
    and self.step()
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py", line 329, in step
    getattr(self, inst.opname)(inst)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py", line 181, in wrapper
    return inner_fn(self, inst)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py", line 786, in CALL_FUNCTION
    self.call_function(fn, args, {})
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py", line 271, in call_function
    self.push(fn.call_function(self, args, kwargs))
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/variables/torch.py", line 408, in call_function
    tensor_variable = TensorVariable.create(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/variables/tensor.py", line 199, in create
    example_value = _get_fake_value(proxy.node, tx)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/variables/tensor.py", line 130, in _get_fake_value
    return wrap_fake_exception(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 709, in wrap_fake_exception
    return fn()
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/variables/tensor.py", line 131, in <lambda>
    lambda: _run_node(tx.output, node, args, kwargs, nnmodule)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/variables/tensor.py", line 61, in _run_node
    raise RuntimeError(
RuntimeError: Failed running call_function <function grid_sample at 0x7fa17a2ae040>(*(FakeTensor(FakeTensor(..., device='meta', size=(s0, s1, s2, s2)), cuda:0), FakeTensor(FakeTensor(..., device='meta',
           size=(s0, -127.0*s1 + 32.0*(-2*s1 + (-2*s1 + (-2*s1 + (-2*s7 + (s2 - 2*s5 + 12)//2 + 9)//2 + 5)//2 + 5)//2 + 5)//2 + 413.0, -127.0*s1 + 32.0*(-2*s1 + (-2*s1 + (-2*s1 + (-2*s7 + (s2 - 2*s5 + 12)//2 + 9)//2 + 5)//2 + 5)//2 + 5)//2 + 413.0, 2),
           grad_fn=<StackBackward0>), cuda:0)), **{}):
TypeError: cannot determine truth value of Relational

At:
  /scratch/ezyang/work/env/lib/python3.9/site-packages/sympy/core/relational.py(511): __bool__
  /scratch/ezyang/work/pytorch/torch/fx/experimental/symbolic_shapes.py(203): bool_
  /scratch/ezyang/work/pytorch/torch/nn/functional.py(4239): grid_sample
  /scratch/ezyang/work/pytorch/torch/_dynamo/variables/tensor.py(52): _run_node
  /scratch/ezyang/work/pytorch/torch/_dynamo/variables/tensor.py(131): <lambda>
  /scratch/ezyang/work/pytorch/torch/_dynamo/utils.py(709): wrap_fake_exception
  /scratch/ezyang/work/pytorch/torch/_dynamo/variables/tensor.py(130): _get_fake_value
  /scratch/ezyang/work/pytorch/torch/_dynamo/variables/tensor.py(199): create
  /scratch/ezyang/work/pytorch/torch/_dynamo/variables/torch.py(408): call_function
  /scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py(271): call_function
  /scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py(786): CALL_FUNCTION
  /scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py(181): wrapper
  /scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py(329): step
  /scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py(359): run
  /scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py(1620): inline_call_
  /scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py(1566): inline_call
  /scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py(300): inline_user_function_return
  /scratch/ezyang/work/pytorch/torch/_dynamo/variables/nn_module.py(221): call_function
  /scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py(271): call_function
  /scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py(786): CALL_FUNCTION
  /scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py(181): wrapper
  /scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py(329): step
  /scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py(359): run
  /scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py(1494): run
  /scratch/ezyang/work/pytorch/torch/_dynamo/convert_frame.py(384): transform
  /scratch/ezyang/work/pytorch/torch/_dynamo/bytecode_transformation.py(341): transform_code_object
  /scratch/ezyang/work/pytorch/torch/_dynamo/convert_frame.py(396): _compile
  /scratch/ezyang/work/pytorch/torch/_dynamo/convert_frame.py(341): _convert_frame_assert
  /scratch/ezyang/work/pytorch/torch/_dynamo/utils.py(87): time_wrapper
  /scratch/ezyang/work/pytorch/torch/_dynamo/convert_frame.py(112): _fn
  /scratch/ezyang/work/pytorch/torch/_dynamo/convert_frame.py(459): _convert_frame
  /scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py(251): catch_errors
  /scratch/ezyang/work/pytorch/torch/nn/modules/module.py(1423): _call_impl
  /scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py(335): <graph break in forward_and_backward_pass>
  /scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py(332): forward_and_backward_pass
  /scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py(1020): run_n_iterations
  /scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py(173): _fn
  /scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py(1122): check_accuracy
  /scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py(1297): run_one_model
  /scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py(1906): run
  /scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py(775): inner
  /scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py(1580): main
  /scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py(349): <module>

(scroll up for backtrace)
TorchDynamo optimized model failed to run because of following error
cuda train Super_SloMo                        FAIL
Running torchbench.py alexnet...
ERROR:common:name 's0' is not defined
Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1122, in check_accuracy
    new_result = optimized_model_iter_fn(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1020, in run_n_iterations
    self.model_iter_fn(mod, inputs, collect_outputs=False)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 332, in forward_and_backward_pass
    cloned_inputs = clone_inputs(inputs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 335, in <graph break in forward_and_backward_pass>
    pred = mod(*cloned_inputs)
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/torchvision/torchvision/models/alexnet.py", line 47, in forward
    def forward(self, x: torch.Tensor) -> torch.Tensor:
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 954, in forward
    return compiled_f(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 945, in new_func
    return compiled_fn(args)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 321, in g
    return f(*args)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 573, in compiled_function
    return CompiledFunction.apply(*remove_dupe_args(args))
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 528, in forward
    fw_outs = call_func_with_args(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 346, in call_func_with_args
    out = normalize_as_list(f(args))
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 185, in run
    return model(new_inputs)
  File "/tmp/torchinductor_ezyang/2x/c2x3p4um7day4xf4pwlhqsjbapihaurcsnxh5vlluu6k2i5ltfux.py", line 396, in call
    return (buf21, primals_1, primals_3, primals_5, primals_7, primals_9, primals_17, buf1, buf2, buf3, buf5, buf6, buf7, buf9, buf11, buf13, buf14, buf15, as_strided(buf16, (2, 9216), (9216, 1)), buf18, buf20, as_strided(primals_15, (1000, 4096), (4096, 1)), as_strided(primals_13, (4096, 4096), (4096, 1)), as_strided(primals_11, (4096, 9216), (9216, 1)), s0, )
NameError: name 's0' is not defined
TorchDynamo optimized model failed to run because of following error
cuda train alexnet                            FAIL
Running torchbench.py attention_is_all_you_need_pytorch...
ERROR:common:
Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1122, in check_accuracy
    new_result = optimized_model_iter_fn(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1020, in run_n_iterations
    self.model_iter_fn(mod, inputs, collect_outputs=False)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 332, in forward_and_backward_pass
    cloned_inputs = clone_inputs(inputs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 335, in <graph break in forward_and_backward_pass>
    pred = mod(*cloned_inputs)
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/torchbenchmark/torchbenchmark/models/attention_is_all_you_need_pytorch/transformer/Models.py", line 171, in forward
    enc_output, *_ = self.encoder(src_seq, src_mask)
  File "/scratch/ezyang/work/torchbenchmark/torchbenchmark/models/attention_is_all_you_need_pytorch/transformer/Models.py", line 172, in <graph break in forward>
    dec_output, *_ = self.decoder(trg_seq, trg_mask, enc_output, src_mask)
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/torchbenchmark/torchbenchmark/models/attention_is_all_you_need_pytorch/transformer/Models.py", line 106, in forward
    dec_output = self.dropout(self.position_enc(self.trg_word_emb(trg_seq)))
  File "/scratch/ezyang/work/torchbenchmark/torchbenchmark/models/attention_is_all_you_need_pytorch/transformer/Models.py", line 106, in <graph break in forward>
    dec_output = self.dropout(self.position_enc(self.trg_word_emb(trg_seq)))
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 954, in forward
    return compiled_f(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 940, in new_func
    compiled_fn = create_aot_dispatcher_function(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 660, in create_aot_dispatcher_function
    aot_dispatch_autograd(flat_fn, fake_flat_tensor_args, aot_config)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 516, in aot_dispatch_autograd
    compiled_fw_func = aot_config.fw_compiler(fw_module, deduped_flat_args)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 87, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 351, in fw_compiler
    return compile_fx_inner(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/debug_utils.py", line 444, in debug_wrapper
    compiled_fn = compiler_fn(gm, example_inputs, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/debug.py", line 177, in inner
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/env/lib/python3.9/contextlib.py", line 79, in inner
    return func(*args, **kwds)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 122, in compile_fx_inner
    compiled_fn = graph.compile_to_fn()
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 349, in compile_to_fn
    return self.compile_to_module().call
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 87, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 335, in compile_to_module
    code = self.codegen()
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 326, in codegen
    self.wrapper_code = WrapperCodeGen()
  File "/scratch/ezyang/work/pytorch/torch/_inductor/codegen/wrapper.py", line 240, in __init__
    V.graph.sizevars.codegen(self.prefix, V.graph.graph_inputs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/sizevars.py", line 481, in codegen
    assert not needed
AssertionError
TorchDynamo optimized model failed to run because of following error
cuda train attention_is_all_you_need_pytorch  FAIL
Running torchbench.py dcgan...
cuda train dcgan                              PASS
Running torchbench.py densenet121...
ERROR:common:name 's0' is not defined
Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1122, in check_accuracy
    new_result = optimized_model_iter_fn(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1020, in run_n_iterations
    self.model_iter_fn(mod, inputs, collect_outputs=False)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 332, in forward_and_backward_pass
    cloned_inputs = clone_inputs(inputs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 335, in <graph break in forward_and_backward_pass>
    pred = mod(*cloned_inputs)
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/torchvision/torchvision/models/densenet.py", line 213, in forward
    def forward(self, x: Tensor) -> Tensor:
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 954, in forward
    return compiled_f(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 945, in new_func
    return compiled_fn(args)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 321, in g
    return f(*args)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 573, in compiled_function
    return CompiledFunction.apply(*remove_dupe_args(args))
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 528, in forward
    fw_outs = call_func_with_args(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 346, in call_func_with_args
    out = normalize_as_list(f(args))
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 185, in run
    return model(new_inputs)
  File "/tmp/torchinductor_ezyang/xt/cxtlzawk5soh56qeaeeyglydxl7bo3iyqfn2zd4imra55rvwtof4.py", line 5748, in call
    return (buf895, primals_1, primals_2, primals_4, primals_6, primals_7, primals_9, primals_10, primals_12, primals_13, primals_15, primals_16, primals_18, primals_19, primals_21, primals_22, primals_24, primals_25, primals_27, primals_28, primals_30, primals_31, primals_33, primals_34, primals_36, primals_37, primals_39, primals_40, primals_42, primals_43, primals_45, primals_46, primals_48, primals_49, primals_51, primals_52, primals_54, primals_55, primals_57, primals_58, primals_60, primals_61, primals_63, primals_64, primals_66, primals_67, primals_69, primals_70, primals_72, primals_73, primals_75, primals_76, primals_78, primals_79, primals_81, primals_82, primals_84, primals_85, primals_87, primals_88, primals_90, primals_91, primals_93, primals_94, primals_96, primals_97, primals_99, primals_100, primals_102, primals_103, primals_105, primals_106, primals_108, primals_109, primals_111, primals_112, primals_114, primals_115, primals_117, primals_118, primals_120, primals_121, primals_123, primals_124, primals_126, primals_127, primals_129, primals_130, primals_132, primals_133, primals_135, primals_136, primals_138, primals_139, primals_141, primals_142, primals_144, primals_145, primals_147, primals_148, primals_150, primals_151, primals_153, primals_154, primals_156, primals_157, primals_159, primals_160, primals_162, primals_163, primals_165, primals_166, primals_168, primals_169, primals_171, primals_172, primals_174, primals_175, primals_177, primals_178, primals_180, primals_181, primals_183, primals_184, primals_186, primals_187, primals_189, primals_190, primals_192, primals_193, primals_195, primals_196, primals_198, primals_199, primals_201, primals_202, primals_204, primals_205, primals_207, primals_208, primals_210, primals_211, primals_213, primals_214, primals_216, primals_217, primals_219, primals_220, primals_222, primals_223, primals_225, primals_226, primals_228, primals_229, primals_231, primals_232, primals_234, primals_235, 
primals_237, primals_238, primals_240, primals_241, primals_243, primals_244, primals_246, primals_247, primals_249, primals_250, primals_252, primals_253, primals_255, primals_256, primals_258, primals_259, primals_261, primals_262, primals_264, primals_265, primals_267, primals_268, primals_270, primals_271, primals_273, primals_274, primals_276, primals_277, primals_279, primals_280, primals_282, primals_283, primals_285, primals_286, primals_288, primals_289, primals_291, primals_292, primals_294, primals_295, primals_297, primals_298, primals_300, primals_301, primals_303, primals_304, primals_306, primals_307, primals_309, primals_310, primals_312, primals_313, primals_315, primals_316, primals_318, primals_319, primals_321, primals_322, primals_324, primals_325, primals_327, primals_328, primals_330, primals_331, primals_333, primals_334, primals_336, primals_337, primals_339, primals_340, primals_342, primals_343, primals_345, primals_346, primals_348, primals_349, primals_351, primals_352, primals_354, primals_355, primals_357, primals_358, primals_360, primals_361, primals_365, primals_366, primals_368, primals_369, primals_371, primals_372, primals_374, primals_375, primals_377, primals_378, primals_380, primals_381, primals_383, primals_384, primals_386, primals_387, primals_389, primals_390, primals_392, primals_393, primals_395, primals_396, primals_398, primals_399, primals_401, primals_402, primals_404, primals_405, primals_407, primals_408, primals_410, primals_411, primals_413, primals_414, primals_416, primals_417, primals_419, primals_420, primals_422, primals_423, primals_425, primals_426, primals_428, primals_429, primals_431, primals_432, primals_434, primals_435, primals_437, primals_438, primals_440, primals_441, primals_443, primals_444, primals_446, primals_447, primals_449, primals_450, primals_452, primals_453, primals_455, primals_456, primals_458, primals_459, primals_461, primals_462, primals_464, primals_465, primals_467, 
primals_468, primals_470, primals_471, primals_473, primals_474, primals_476, primals_477, primals_479, primals_480, primals_482, primals_483, primals_485, primals_486, primals_488, primals_489, primals_491, primals_492, primals_494, primals_495, primals_497, primals_498, primals_500, primals_501, primals_503, primals_504, primals_506, primals_507, primals_509, primals_510, primals_512, primals_513, primals_515, primals_516, primals_518, primals_519, primals_521, primals_522, primals_524, primals_525, primals_527, primals_528, primals_530, primals_531, primals_533, primals_534, primals_536, primals_537, primals_539, primals_540, primals_542, primals_543, primals_545, primals_546, primals_548, primals_549, primals_551, primals_552, primals_554, primals_555, primals_557, primals_558, primals_560, primals_561, primals_563, primals_564, primals_566, primals_567, primals_569, primals_570, primals_572, primals_573, primals_575, primals_576, primals_578, primals_579, primals_581, primals_582, primals_584, primals_585, primals_587, primals_588, primals_590, primals_591, primals_593, primals_594, primals_596, primals_597, primals_599, primals_600, primals_602, primals_603, primals_605, primals_606, primals_608, primals_609, primals_611, primals_612, primals_614, primals_615, primals_617, primals_618, primals_620, primals_621, primals_623, primals_624, primals_626, primals_627, primals_629, primals_630, primals_632, primals_633, primals_635, primals_636, primals_638, primals_639, primals_641, primals_642, primals_644, primals_645, primals_647, primals_648, primals_650, primals_651, primals_653, primals_654, primals_656, primals_657, primals_659, primals_660, primals_662, primals_663, primals_665, primals_666, primals_668, primals_669, primals_671, primals_672, primals_674, primals_675, primals_677, primals_678, primals_680, primals_681, primals_683, primals_684, primals_686, primals_687, primals_689, primals_690, primals_692, primals_693, primals_695, primals_696, 
primals_698, primals_699, primals_701, primals_702, primals_704, primals_705, primals_707, primals_708, primals_710, primals_711, primals_713, primals_714, primals_716, primals_717, primals_719, primals_720, primals_722, primals_723, primals_725, primals_726, primals_728, buf0, buf1, buf3, buf2, buf4, buf5, buf6, buf9, buf10, buf11, buf12, buf17, buf18, buf19, buf20, buf26, buf27, buf28, buf29, buf36, buf37, buf38, buf39, buf47, buf48, buf49, buf50, buf59, buf60, buf61, buf62, buf63, buf64, buf65, buf68, buf69, buf70, buf71, buf76, buf77, buf78, buf79, buf85, buf86, buf87, buf88, buf95, buf96, buf97, buf98, buf106, buf107, buf108, buf109, buf118, buf119, buf120, buf121, buf131, buf132, buf133, buf134, buf145, buf146, buf147, buf148, buf160, buf161, buf162, buf163, buf176, buf177, buf178, buf179, buf193, buf194, buf195, buf196, buf211, buf212, buf213, buf214, buf215, buf216, buf217, buf220, buf221, buf222, buf223, buf228, buf229, buf230, buf231, buf237, buf238, buf239, buf240, buf247, buf248, buf249, buf250, buf258, buf259, buf260, buf261, buf270, buf271, buf272, buf273, buf283, buf284, buf285, buf286, buf297, buf298, buf299, buf300, buf312, buf313, buf314, buf315, buf328, buf329, buf330, buf331, buf345, buf346, buf347, buf348, buf363, buf364, buf365, buf366, buf382, buf383, buf384, buf385, buf402, buf403, buf404, buf405, buf423, buf424, buf425, buf426, buf445, buf446, buf447, buf448, buf468, buf469, buf470, buf471, buf492, buf493, buf494, buf495, buf517, buf518, buf519, buf520, buf543, buf544, buf545, buf546, buf570, buf571, buf572, buf573, buf598, buf599, buf600, buf601, buf627, buf628, buf629, buf630, buf657, buf658, buf659, buf660, buf661, buf662, buf663, buf666, buf667, buf668, buf669, buf674, buf675, buf676, buf677, buf683, buf684, buf685, buf686, buf693, buf694, buf695, buf696, buf704, buf705, buf706, buf707, buf716, buf717, buf718, buf719, buf729, buf730, buf731, buf732, buf743, buf744, buf745, buf746, buf758, buf759, buf760, buf761, buf774, buf775, buf776, 
buf777, buf791, buf792, buf793, buf794, buf809, buf810, buf811, buf812, buf828, buf829, buf830, buf831, buf848, buf849, buf850, buf851, buf869, buf870, buf871, buf872, buf891, as_strided(buf894, (2, 1024), (1024, 1)), as_strided(primals_363, (1000, 1024), (1024, 1)), buf896, s0, 7, 7, )
NameError: name 's0' is not defined
TorchDynamo optimized model failed to run because of following error
cuda train densenet121                        FAIL
WARNING:root:detectron2_fcos_r_50_fpn failed to load
FCOS train is not supported by upstream detectron2. See GH Issue: https://github.com/facebookresearch/detectron2/issues/4369.
Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1877, in run
    device, name, model, example_inputs, batch_size = runner.load_model(
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 251, in load_model
    benchmark = benchmark_cls(
  File "/scratch/ezyang/work/torchbenchmark/torchbenchmark/util/model.py", line 18, in __call__
    obj = type.__call__(cls, *args, **kwargs)
  File "/scratch/ezyang/work/torchbenchmark/torchbenchmark/models/detectron2_fcos_r_50_fpn/__init__.py", line 15, in __init__
    super().__init__(variant="COCO-Detection/fcos_R_50_FPN_1x.py", test=test, device=device,
  File "/scratch/ezyang/work/torchbenchmark/torchbenchmark/util/framework/detectron2/model_factory.py", line 100, in __init__
    loader = self.setup_train(cfg, args)
  File "/scratch/ezyang/work/torchbenchmark/torchbenchmark/util/framework/detectron2/model_factory.py", line 110, in setup_train
    raise NotImplementedError("FCOS train is not supported by upstream detectron2. " \
NotImplementedError: FCOS train is not supported by upstream detectron2. See GH Issue: https://github.com/facebookresearch/detectron2/issues/4369.

WARNING:root:detectron2_maskrcnn_r_50_c4 failed to load
Eager model failed to run
Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 978, in validate_model
    self.model_iter_fn(model, example_inputs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 336, in forward_and_backward_pass
    loss = self.compute_loss(pred)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 326, in compute_loss
    return reduce_to_scalar_loss(pred)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/testing.py", line 87, in reduce_to_scalar_loss
    return sum([reduce_to_scalar_loss(x) for x in out]) / len(out)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/testing.py", line 87, in <listcomp>
    return sum([reduce_to_scalar_loss(x) for x in out]) / len(out)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/testing.py", line 97, in reduce_to_scalar_loss
    return sum([reduce_to_scalar_loss(value) for value in out.values()]) / len(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/testing.py", line 97, in <listcomp>
    return sum([reduce_to_scalar_loss(value) for value in out.values()]) / len(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/testing.py", line 102, in reduce_to_scalar_loss
    raise NotImplementedError("Don't know how to reduce", type(out))
NotImplementedError: ("Don't know how to reduce", <class 'detectron2.structures.instances.Instances'>)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1877, in run
    device, name, model, example_inputs, batch_size = runner.load_model(
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 282, in load_model
    self.validate_model(model, example_inputs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 980, in validate_model
    raise NotImplementedError("Eager model failed to run")
NotImplementedError: Eager model failed to run

Running torchbench.py dlrm...
[2022-11-06 02:55:50,221] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._embedding_bag
[2022-11-06 02:55:50,224] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._embedding_bag
[2022-11-06 02:55:50,228] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._embedding_bag
[2022-11-06 02:55:50,231] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._embedding_bag
[2022-11-06 02:55:50,235] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._embedding_bag
[2022-11-06 02:55:50,238] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._embedding_bag
[2022-11-06 02:55:50,241] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._embedding_bag
[2022-11-06 02:55:50,245] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._embedding_bag
ERROR:common:'SymInt' object cannot be interpreted as an integer

While executing %range_1 : [#users=1] = call_function[target=builtins.range](args = (%getitem_12,), kwargs = {})
Original traceback:
Module stack: {}
  File "/scratch/ezyang/work/torchbenchmark/torchbenchmark/models/dlrm/dlrm_s_pytorch.py", line 319, in interact_features
    li = torch.tensor([i for i in range(ni) for j in range(i + offset)], device=x.device)
Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1122, in check_accuracy
    new_result = optimized_model_iter_fn(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1020, in run_n_iterations
    self.model_iter_fn(mod, inputs, collect_outputs=False)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 332, in forward_and_backward_pass
    cloned_inputs = clone_inputs(inputs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 335, in <graph break in forward_and_backward_pass>
    pred = mod(*cloned_inputs)
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/torchbenchmark/torchbenchmark/models/dlrm/dlrm_s_pytorch.py", line 338, in forward
    return self.sequential_forward(dense_x, lS_o, lS_i)
  File "/scratch/ezyang/work/torchbenchmark/torchbenchmark/models/dlrm/dlrm_s_pytorch.py", line 355, in sequential_forward
    z = self.interact_features(x, ly)
  File "/scratch/ezyang/work/torchbenchmark/torchbenchmark/models/dlrm/dlrm_s_pytorch.py", line 302, in interact_features
    def interact_features(self, x, ly):
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 954, in forward
    return compiled_f(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 940, in new_func
    compiled_fn = create_aot_dispatcher_function(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 660, in create_aot_dispatcher_function
    aot_dispatch_autograd(flat_fn, fake_flat_tensor_args, aot_config)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 462, in aot_dispatch_autograd
    out = flat_fn(*flat_args)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 901, in functional_call
    out = Interpreter(mod).run(*args[params_len:], **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/fx/interpreter.py", line 130, in run
    self.env[node] = self.run_node(node)
  File "/scratch/ezyang/work/pytorch/torch/fx/interpreter.py", line 171, in run_node
    return getattr(self, n.op)(n.target, args, kwargs)
  File "/scratch/ezyang/work/pytorch/torch/fx/interpreter.py", line 243, in call_function
    return target(*args, **kwargs)
TypeError: 'SymInt' object cannot be interpreted as an integer

While executing %range_1 : [#users=1] = call_function[target=builtins.range](args = (%getitem_12,), kwargs = {})
Original traceback:
Module stack: {}
  File "/scratch/ezyang/work/torchbenchmark/torchbenchmark/models/dlrm/dlrm_s_pytorch.py", line 319, in interact_features
    li = torch.tensor([i for i in range(ni) for j in range(i + offset)], device=x.device)

TorchDynamo optimized model failed to run because of following error
cuda train dlrm                               FAIL
/scratch/ezyang/work/pytorch/torch/utils/tensorboard/__init__.py:4: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
  if not hasattr(tensorboard, "__version__") or LooseVersion(
/scratch/ezyang/work/env/lib/python3.9/site-packages/gym/core.py:317: DeprecationWarning: [33mWARN: Initializing wrapper in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future.[0m
  deprecation(
Running torchbench.py drq...
cuda train drq FAIL (TIMEOUT)
Running torchbench.py fastNLP_Bert...
[2022-11-06 03:05:59,955] torch._inductor.ir: [WARNING] DeviceCopy
ERROR:common:RuntimeError: Overloaded torch operator invoked from Python failed to many any schema:
aten::arange() expected at most 5 argument(s) but received 7 argument(s). Declaration: aten::arange(Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

aten::arange() expected at most 6 argument(s) but received 7 argument(s). Declaration: aten::arange.start(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

aten::arange() Expected a value of type 'number' for argument 'end' but instead found type 'Symbol'.
Position: 1
Value: s1
Declaration: aten::arange.start_step(Scalar start, Scalar end, Scalar step=1, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
Cast error details: Cannot cast s1 to number

aten::arange() expected at most 4 argument(s) but received 7 argument(s). Declaration: aten::arange.start_out(Scalar start, Scalar end, Scalar step=1, *, Tensor(a!) out) -> Tensor(a!)

aten::arange() expected at most 2 argument(s) but received 7 argument(s). Declaration: aten::arange.out(Scalar end, *, Tensor(a!) out) -> Tensor(a!)


  target: aten.arange.default
  args[0]: s1
  kwargs: {'dtype': torch.int64, 'device': device(type='cuda', index=0), 'pin_memory': False}

While executing %arange : [#users=1] = call_function[target=torch.ops.aten.arange.default](args = (%sym_size,), kwargs = {dtype: torch.int64, device: cuda:0, pin_memory: False})
Original traceback:
Module stack: {'self_embeddings': <class 'fastNLP.modules.encoder.bert.BertEmbeddings'>}
  File "/scratch/ezyang/work/env/lib/python3.9/site-packages/fastNLP/modules/encoder/bert.py", line 230, in forward
    position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/fastNLP/modules/encoder/bert.py", line 512, in forward
    embedding_output = self.embeddings(input_ids, token_type_ids)
Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 239, in call_function
    out = lowerings[target](*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/lowering.py", line 211, in wrapped
    return decomp_fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/lowering.py", line 1269, in arange
    return fallback_arange(
  File "/scratch/ezyang/work/pytorch/torch/_inductor/lowering.py", line 968, in handler
    result = ir.FallbackKernel.create(kernel, *args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/ir.py", line 2918, in create
    ) = cls.process_kernel(kernel, *args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/ir.py", line 2297, in process_kernel
    example_output = kernel(*new_args, **new_kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_ops.py", line 445, in __call__
    return self._op(*args, **kwargs or {})
RuntimeError: Overloaded torch operator invoked from Python failed to many any schema:
aten::arange() expected at most 5 argument(s) but received 7 argument(s). Declaration: aten::arange(Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

aten::arange() expected at most 6 argument(s) but received 7 argument(s). Declaration: aten::arange.start(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

aten::arange() Expected a value of type 'number' for argument 'end' but instead found type 'Symbol'.
Position: 1
Value: s1
Declaration: aten::arange.start_step(Scalar start, Scalar end, Scalar step=1, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
Cast error details: Cannot cast s1 to number

aten::arange() expected at most 4 argument(s) but received 7 argument(s). Declaration: aten::arange.start_out(Scalar start, Scalar end, Scalar step=1, *, Tensor(a!) out) -> Tensor(a!)

aten::arange() expected at most 2 argument(s) but received 7 argument(s). Declaration: aten::arange.out(Scalar end, *, Tensor(a!) out) -> Tensor(a!)


The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1122, in check_accuracy
    new_result = optimized_model_iter_fn(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1020, in run_n_iterations
    self.model_iter_fn(mod, inputs, collect_outputs=False)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 332, in forward_and_backward_pass
    cloned_inputs = clone_inputs(inputs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 335, in <graph break in forward_and_backward_pass>
    pred = mod(*cloned_inputs)
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/env/lib/python3.9/site-packages/fastNLP/models/bert.py", line 265, in forward
    sequence_output = self.bert(words)
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/env/lib/python3.9/site-packages/fastNLP/embeddings/bert_embedding.py", line 137, in forward
    outputs = self.model(words)
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/env/lib/python3.9/site-packages/fastNLP/embeddings/bert_embedding.py", line 445, in forward
    max_word_piece_length = batch_word_pieces_length.sum(dim=-1).max().item()  # 表示word piece的长度(包括padding)
  File "/scratch/ezyang/work/env/lib/python3.9/site-packages/fastNLP/embeddings/bert_embedding.py", line 462, in <graph break in forward>
    word_indexes = words.cpu().numpy()
  File "/scratch/ezyang/work/env/lib/python3.9/site-packages/fastNLP/embeddings/bert_embedding.py", line 482, in <graph break in forward>
    bert_outputs, pooled_cls = self.encoder(word_pieces, token_type_ids=token_type_ids, attention_mask=attn_masks,
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/env/lib/python3.9/site-packages/fastNLP/modules/encoder/bert.py", line 480, in forward
    def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True):
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 954, in forward
    return compiled_f(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 940, in new_func
    compiled_fn = create_aot_dispatcher_function(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 660, in create_aot_dispatcher_function
    aot_dispatch_autograd(flat_fn, fake_flat_tensor_args, aot_config)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 516, in aot_dispatch_autograd
    compiled_fw_func = aot_config.fw_compiler(fw_module, deduped_flat_args)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 87, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 351, in fw_compiler
    return compile_fx_inner(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/debug_utils.py", line 444, in debug_wrapper
    compiled_fn = compiler_fn(gm, example_inputs, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/debug.py", line 177, in inner
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/env/lib/python3.9/contextlib.py", line 79, in inner
    return func(*args, **kwds)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 121, in compile_fx_inner
    graph.run(*example_inputs)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 87, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 129, in run
    return super().run(*args)
  File "/scratch/ezyang/work/pytorch/torch/fx/interpreter.py", line 130, in run
    self.env[node] = self.run_node(node)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 299, in run_node
    result = super().run_node(n)
  File "/scratch/ezyang/work/pytorch/torch/fx/interpreter.py", line 171, in run_node
    return getattr(self, n.op)(n.target, args, kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 242, in call_function
    raise LoweringException(e, target, args, kwargs) from e
torch._inductor.exc.LoweringException: RuntimeError: Overloaded torch operator invoked from Python failed to many any schema:
aten::arange() expected at most 5 argument(s) but received 7 argument(s). Declaration: aten::arange(Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

aten::arange() expected at most 6 argument(s) but received 7 argument(s). Declaration: aten::arange.start(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

aten::arange() Expected a value of type 'number' for argument 'end' but instead found type 'Symbol'.
Position: 1
Value: s1
Declaration: aten::arange.start_step(Scalar start, Scalar end, Scalar step=1, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
Cast error details: Cannot cast s1 to number

aten::arange() expected at most 4 argument(s) but received 7 argument(s). Declaration: aten::arange.start_out(Scalar start, Scalar end, Scalar step=1, *, Tensor(a!) out) -> Tensor(a!)

aten::arange() expected at most 2 argument(s) but received 7 argument(s). Declaration: aten::arange.out(Scalar end, *, Tensor(a!) out) -> Tensor(a!)


  target: aten.arange.default
  args[0]: s1
  kwargs: {'dtype': torch.int64, 'device': device(type='cuda', index=0), 'pin_memory': False}

While executing %arange : [#users=1] = call_function[target=torch.ops.aten.arange.default](args = (%sym_size,), kwargs = {dtype: torch.int64, device: cuda:0, pin_memory: False})
Original traceback:
Module stack: {'self_embeddings': <class 'fastNLP.modules.encoder.bert.BertEmbeddings'>}
  File "/scratch/ezyang/work/env/lib/python3.9/site-packages/fastNLP/modules/encoder/bert.py", line 230, in forward
    position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/fastNLP/modules/encoder/bert.py", line 512, in forward
    embedding_output = self.embeddings(input_ids, token_type_ids)

TorchDynamo optimized model failed to run because of following error
cuda train fastNLP_Bert                       FAIL
Running torchbench.py functorch_dp_cifar10...
cuda train functorch_dp_cifar10 FAIL (TIMEOUT)
Running torchbench.py functorch_maml_omniglot...
ERROR:common:name 's0' is not defined
Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1122, in check_accuracy
    new_result = optimized_model_iter_fn(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1020, in run_n_iterations
    self.model_iter_fn(mod, inputs, collect_outputs=False)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 332, in forward_and_backward_pass
    cloned_inputs = clone_inputs(inputs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 335, in <graph break in forward_and_backward_pass>
    pred = mod(*cloned_inputs)
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/container.py", line 202, in forward
    def forward(self, input):
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 954, in forward
    return compiled_f(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 945, in new_func
    return compiled_fn(args)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 321, in g
    return f(*args)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 573, in compiled_function
    return CompiledFunction.apply(*remove_dupe_args(args))
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 528, in forward
    fw_outs = call_func_with_args(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 346, in call_func_with_args
    out = normalize_as_list(f(args))
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 185, in run
    return model(new_inputs)
  File "/tmp/torchinductor_ezyang/sh/cshw6h3my7hq3jqubhwesz632iajmgvsfeb2eapylwd5lij5nhlr.py", line 555, in call
    return (buf21, primals_1, primals_3, primals_5, primals_7, primals_9, primals_11, primals_15, buf1, as_strided(buf22, (64, ), (1, )), buf5, buf6, buf23, buf8, as_strided(buf24, (64, ), (1, )), buf12, buf13, buf25, buf15, as_strided(buf26, (64, ), (1, )), buf19, buf27, as_strided(buf20, (5, 64), (64, 1)), as_strided(primals_13, (5, 64), (64, 1)), as_strided(buf28, (1, 64, 1, 1), (0, 1, 0, 0)), as_strided(buf29, (1, 64, 1, 1), (0, 1, 0, 0)), as_strided(buf30, (1, 64, 1, 1), (0, 1, 0, 0)), s0, 1, 1, )
NameError: name 's0' is not defined
TorchDynamo optimized model failed to run because of following error
cuda train functorch_maml_omniglot            FAIL
Running torchbench.py hf_Albert...
ERROR:common:'int' object has no attribute 'size'

While executing %sym_size : [#users=2] = placeholder[target=sym_size]
Original traceback:
Module stack: {'self_predictions': <class 'transformers.models.albert.modeling_albert.AlbertMLMHead'>, 'self_predictions_dense': <class 'torch.nn.modules.linear.Linear'>}
  File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/albert/modeling_albert.py", line 883, in forward
    hidden_states = self.dense(hidden_states)
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/albert/modeling_albert.py", line 1003, in <graph break in forward>
    prediction_scores = self.predictions(sequence_outputs)

Gradient addition node due to multiple use of tensor around:
Module stack: {'self_embedding_hidden_mapping_in': <class 'torch.nn.modules.linear.Linear'>}
  File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/albert/modeling_albert.py", line 470, in forward
    hidden_states = self.embedding_hidden_mapping_in(hidden_states)
Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1122, in check_accuracy
    new_result = optimized_model_iter_fn(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1020, in run_n_iterations
    self.model_iter_fn(mod, inputs, collect_outputs=False)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 332, in forward_and_backward_pass
    cloned_inputs = clone_inputs(inputs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 337, in <graph break in forward_and_backward_pass>
    self.grad_scaler.scale(loss).backward()
  File "/scratch/ezyang/work/pytorch/torch/_tensor.py", line 450, in backward
    torch.autograd.backward(
  File "/scratch/ezyang/work/pytorch/torch/autograd/__init__.py", line 197, in backward
    Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  File "/scratch/ezyang/work/pytorch/torch/autograd/function.py", line 270, in apply
    return user_fn(self, *args)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 558, in backward
    CompiledFunction.compiled_bw = aot_config.bw_compiler(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/optimizations/backends.py", line 555, in _wrapped_bw_compiler
    return disable(disable(bw_compiler)(*args, **kwargs))
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 87, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 362, in bw_compiler
    return compile_fx_inner(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/debug_utils.py", line 444, in debug_wrapper
    compiled_fn = compiler_fn(gm, example_inputs, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/debug.py", line 177, in inner
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/env/lib/python3.9/contextlib.py", line 79, in inner
    return func(*args, **kwds)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 121, in compile_fx_inner
    graph.run(*example_inputs)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 87, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 129, in run
    return super().run(*args)
  File "/scratch/ezyang/work/pytorch/torch/fx/interpreter.py", line 130, in run
    self.env[node] = self.run_node(node)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 299, in run_node
    result = super().run_node(n)
  File "/scratch/ezyang/work/pytorch/torch/fx/interpreter.py", line 171, in run_node
    return getattr(self, n.op)(n.target, args, kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 199, in placeholder
    sizes, strides = self.static_sizes_strides(example)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 53, in static_sizes_strides
    size = [sympy.Integer(i) for i in ex.size()]
AttributeError: 'int' object has no attribute 'size'

While executing %sym_size : [#users=2] = placeholder[target=sym_size]
Original traceback:
Module stack: {'self_predictions': <class 'transformers.models.albert.modeling_albert.AlbertMLMHead'>, 'self_predictions_dense': <class 'torch.nn.modules.linear.Linear'>}
  File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/albert/modeling_albert.py", line 883, in forward
    hidden_states = self.dense(hidden_states)
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/albert/modeling_albert.py", line 1003, in <graph break in forward>
    prediction_scores = self.predictions(sequence_outputs)

Gradient addition node due to multiple use of tensor around:
Module stack: {'self_embedding_hidden_mapping_in': <class 'torch.nn.modules.linear.Linear'>}
  File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/albert/modeling_albert.py", line 470, in forward
    hidden_states = self.embedding_hidden_mapping_in(hidden_states)

TorchDynamo optimized model failed to run because of following error
cuda train hf_Albert                          FAIL
Running torchbench.py hf_Bart...
ERROR:common:RuntimeError: Overloaded torch operator invoked from Python failed to many any schema:
aten::arange() expected at most 5 argument(s) but received 7 argument(s). Declaration: aten::arange(Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

aten::arange() expected at most 6 argument(s) but received 7 argument(s). Declaration: aten::arange.start(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

aten::arange() Expected a value of type 'number' for argument 'end' but instead found type 'Symbol'.
Position: 1
Value: s1
Declaration: aten::arange.start_step(Scalar start, Scalar end, Scalar step=1, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
Cast error details: Cannot cast s1 to number

aten::arange() expected at most 4 argument(s) but received 7 argument(s). Declaration: aten::arange.start_out(Scalar start, Scalar end, Scalar step=1, *, Tensor(a!) out) -> Tensor(a!)

aten::arange() expected at most 2 argument(s) but received 7 argument(s). Declaration: aten::arange.out(Scalar end, *, Tensor(a!) out) -> Tensor(a!)


  target: aten.arange.start
  args[0]: 0
  args[1]: s1
  kwargs: {'dtype': torch.int64, 'device': device(type='cuda', index=0), 'pin_memory': False}

While executing %arange : [#users=1] = call_function[target=torch.ops.aten.arange.start](args = (0, %add), kwargs = {dtype: torch.int64, device: cuda:0, pin_memory: False})
Original traceback:
Module stack: {'self_embed_positions': <class 'transformers.models.bart.modeling_bart.BartLearnedPositionalEmbedding'>}
  File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/bart/modeling_bart.py", line 134, in forward
    positions = torch.arange(
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/bart/modeling_bart.py", line 801, in forward
    embed_pos = self.embed_positions(input_shape)
Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 239, in call_function
    out = lowerings[target](*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/lowering.py", line 211, in wrapped
    return decomp_fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/lowering.py", line 1269, in arange
    return fallback_arange(
  File "/scratch/ezyang/work/pytorch/torch/_inductor/lowering.py", line 968, in handler
    result = ir.FallbackKernel.create(kernel, *args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/ir.py", line 2918, in create
    ) = cls.process_kernel(kernel, *args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/ir.py", line 2297, in process_kernel
    example_output = kernel(*new_args, **new_kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_ops.py", line 445, in __call__
    return self._op(*args, **kwargs or {})
RuntimeError: Overloaded torch operator invoked from Python failed to many any schema:
aten::arange() expected at most 5 argument(s) but received 7 argument(s). Declaration: aten::arange(Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

aten::arange() expected at most 6 argument(s) but received 7 argument(s). Declaration: aten::arange.start(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

aten::arange() Expected a value of type 'number' for argument 'end' but instead found type 'Symbol'.
Position: 1
Value: s1
Declaration: aten::arange.start_step(Scalar start, Scalar end, Scalar step=1, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
Cast error details: Cannot cast s1 to number

aten::arange() expected at most 4 argument(s) but received 7 argument(s). Declaration: aten::arange.start_out(Scalar start, Scalar end, Scalar step=1, *, Tensor(a!) out) -> Tensor(a!)

aten::arange() expected at most 2 argument(s) but received 7 argument(s). Declaration: aten::arange.out(Scalar end, *, Tensor(a!) out) -> Tensor(a!)


The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1122, in check_accuracy
    new_result = optimized_model_iter_fn(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1020, in run_n_iterations
    self.model_iter_fn(mod, inputs, collect_outputs=False)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 332, in forward_and_backward_pass
    cloned_inputs = clone_inputs(inputs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 335, in <graph break in forward_and_backward_pass>
    pred = mod(*cloned_inputs)
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/torchbenchmark/torchbenchmark/util/framework/huggingface/model_factory.py", line 41, in forward
    return self.model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/bart/modeling_bart.py", line 1353, in forward
    outputs = self.model(
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/bart/modeling_bart.py", line 1222, in forward
    encoder_outputs = self.encoder(
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/bart/modeling_bart.py", line 735, in forward
    def forward(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 954, in forward
    return compiled_f(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 940, in new_func
    compiled_fn = create_aot_dispatcher_function(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 660, in create_aot_dispatcher_function
    aot_dispatch_autograd(flat_fn, fake_flat_tensor_args, aot_config)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 516, in aot_dispatch_autograd
    compiled_fw_func = aot_config.fw_compiler(fw_module, deduped_flat_args)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 87, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 351, in fw_compiler
    return compile_fx_inner(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/debug_utils.py", line 444, in debug_wrapper
    compiled_fn = compiler_fn(gm, example_inputs, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/debug.py", line 177, in inner
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/env/lib/python3.9/contextlib.py", line 79, in inner
    return func(*args, **kwds)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 121, in compile_fx_inner
    graph.run(*example_inputs)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 87, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 129, in run
    return super().run(*args)
  File "/scratch/ezyang/work/pytorch/torch/fx/interpreter.py", line 130, in run
    self.env[node] = self.run_node(node)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 299, in run_node
    result = super().run_node(n)
  File "/scratch/ezyang/work/pytorch/torch/fx/interpreter.py", line 171, in run_node
    return getattr(self, n.op)(n.target, args, kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 242, in call_function
    raise LoweringException(e, target, args, kwargs) from e
torch._inductor.exc.LoweringException: RuntimeError: Overloaded torch operator invoked from Python failed to many any schema:
aten::arange() expected at most 5 argument(s) but received 7 argument(s). Declaration: aten::arange(Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

aten::arange() expected at most 6 argument(s) but received 7 argument(s). Declaration: aten::arange.start(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

aten::arange() Expected a value of type 'number' for argument 'end' but instead found type 'Symbol'.
Position: 1
Value: s1
Declaration: aten::arange.start_step(Scalar start, Scalar end, Scalar step=1, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
Cast error details: Cannot cast s1 to number

aten::arange() expected at most 4 argument(s) but received 7 argument(s). Declaration: aten::arange.start_out(Scalar start, Scalar end, Scalar step=1, *, Tensor(a!) out) -> Tensor(a!)

aten::arange() expected at most 2 argument(s) but received 7 argument(s). Declaration: aten::arange.out(Scalar end, *, Tensor(a!) out) -> Tensor(a!)


  target: aten.arange.start
  args[0]: 0
  args[1]: s1
  kwargs: {'dtype': torch.int64, 'device': device(type='cuda', index=0), 'pin_memory': False}

While executing %arange : [#users=1] = call_function[target=torch.ops.aten.arange.start](args = (0, %add), kwargs = {dtype: torch.int64, device: cuda:0, pin_memory: False})
Original traceback:
Module stack: {'self_embed_positions': <class 'transformers.models.bart.modeling_bart.BartLearnedPositionalEmbedding'>}
  File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/bart/modeling_bart.py", line 134, in forward
    positions = torch.arange(
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/bart/modeling_bart.py", line 801, in forward
    embed_pos = self.embed_positions(input_shape)

TorchDynamo optimized model failed to run because of following error
cuda train hf_Bart                            FAIL
Running torchbench.py hf_Bert...
ERROR:common:'int' object has no attribute 'size'

While executing %sym_size : [#users=2] = placeholder[target=sym_size]
Original traceback:
Module stack: {'self_cls': <class 'transformers.models.bert.modeling_bert.BertOnlyMLMHead'>, 'self_cls_predictions': <class 'transformers.models.bert.modeling_bert.BertLMPredictionHead'>, 'self_cls_predictions_transform': <class 'transformers.models.bert.modeling_bert.BertPredictionHeadTransform'>, 'self_cls_predictions_transform_dense': <class 'torch.nn.modules.linear.Linear'>}
  File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/bert/modeling_bert.py", line 675, in forward
    hidden_states = self.dense(hidden_states)
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/bert/modeling_bert.py", line 696, in forward
    hidden_states = self.transform(hidden_states)
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/bert/modeling_bert.py", line 707, in forward
    prediction_scores = self.predictions(sequence_output)
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/bert/modeling_bert.py", line 1366, in <graph break in forward>
    prediction_scores = self.cls(sequence_output)

Gradient addition node due to multiple use of tensor around:
Module stack: {'self_layer_0': <class 'transformers.models.bert.modeling_bert.BertLayer'>, 'self_layer_0_attention': <class 'transformers.models.bert.modeling_bert.BertAttention'>, 'self_layer_0_attention_self': <class 'transformers.models.bert.modeling_bert.BertSelfAttention'>, 'self_layer_0_attention_self_query': <class 'torch.nn.modules.linear.Linear'>}
  File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/bert/modeling_bert.py", line 289, in forward
    mixed_query_layer = self.query(hidden_states)
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/bert/modeling_bert.py", line 423, in forward
    self_outputs = self.self(
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/bert/modeling_bert.py", line 493, in forward
    self_attention_outputs = self.attention(
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/bert/modeling_bert.py", line 607, in forward
    layer_outputs = layer_module(
Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1122, in check_accuracy
    new_result = optimized_model_iter_fn(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1020, in run_n_iterations
    self.model_iter_fn(mod, inputs, collect_outputs=False)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 332, in forward_and_backward_pass
    cloned_inputs = clone_inputs(inputs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 337, in <graph break in forward_and_backward_pass>
    self.grad_scaler.scale(loss).backward()
  File "/scratch/ezyang/work/pytorch/torch/_tensor.py", line 450, in backward
    torch.autograd.backward(
  File "/scratch/ezyang/work/pytorch/torch/autograd/__init__.py", line 197, in backward
    Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  File "/scratch/ezyang/work/pytorch/torch/autograd/function.py", line 270, in apply
    return user_fn(self, *args)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 558, in backward
    CompiledFunction.compiled_bw = aot_config.bw_compiler(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/optimizations/backends.py", line 555, in _wrapped_bw_compiler
    return disable(disable(bw_compiler)(*args, **kwargs))
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 87, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 362, in bw_compiler
    return compile_fx_inner(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/debug_utils.py", line 444, in debug_wrapper
    compiled_fn = compiler_fn(gm, example_inputs, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/debug.py", line 177, in inner
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/env/lib/python3.9/contextlib.py", line 79, in inner
    return func(*args, **kwds)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 121, in compile_fx_inner
    graph.run(*example_inputs)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 87, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 129, in run
    return super().run(*args)
  File "/scratch/ezyang/work/pytorch/torch/fx/interpreter.py", line 130, in run
    self.env[node] = self.run_node(node)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 299, in run_node
    result = super().run_node(n)
  File "/scratch/ezyang/work/pytorch/torch/fx/interpreter.py", line 171, in run_node
    return getattr(self, n.op)(n.target, args, kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 199, in placeholder
    sizes, strides = self.static_sizes_strides(example)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 53, in static_sizes_strides
    size = [sympy.Integer(i) for i in ex.size()]
AttributeError: 'int' object has no attribute 'size'

While executing %sym_size : [#users=2] = placeholder[target=sym_size]
Original traceback:
Module stack: {'self_cls': <class 'transformers.models.bert.modeling_bert.BertOnlyMLMHead'>, 'self_cls_predictions': <class 'transformers.models.bert.modeling_bert.BertLMPredictionHead'>, 'self_cls_predictions_transform': <class 'transformers.models.bert.modeling_bert.BertPredictionHeadTransform'>, 'self_cls_predictions_transform_dense': <class 'torch.nn.modules.linear.Linear'>}
  File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/bert/modeling_bert.py", line 675, in forward
    hidden_states = self.dense(hidden_states)
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/bert/modeling_bert.py", line 696, in forward
    hidden_states = self.transform(hidden_states)
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/bert/modeling_bert.py", line 707, in forward
    prediction_scores = self.predictions(sequence_output)
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/bert/modeling_bert.py", line 1366, in <graph break in forward>
    prediction_scores = self.cls(sequence_output)

Gradient addition node due to multiple use of tensor around:
Module stack: {'self_layer_0': <class 'transformers.models.bert.modeling_bert.BertLayer'>, 'self_layer_0_attention': <class 'transformers.models.bert.modeling_bert.BertAttention'>, 'self_layer_0_attention_self': <class 'transformers.models.bert.modeling_bert.BertSelfAttention'>, 'self_layer_0_attention_self_query': <class 'torch.nn.modules.linear.Linear'>}
  File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/bert/modeling_bert.py", line 289, in forward
    mixed_query_layer = self.query(hidden_states)
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/bert/modeling_bert.py", line 423, in forward
    self_outputs = self.self(
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/bert/modeling_bert.py", line 493, in forward
    self_attention_outputs = self.attention(
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/bert/modeling_bert.py", line 607, in forward
    layer_outputs = layer_module(

TorchDynamo optimized model failed to run because of following error
cuda train hf_Bert                            FAIL
Running torchbench.py hf_BigBird...
[2022-11-06 03:18:23,053] torch._inductor.graph: [WARNING] Creating implicit fallback for:
  target: <built-in function mod>
  args[0]: s1
  args[1]: 64
ERROR:common:TypeError: mod expected 2 arguments, got 0
  target: <built-in function mod>
  args[0]: s1
  args[1]: 64

While executing %mod : [#users=1] = call_function[target=operator.mod](args = (%sym_size, 64), kwargs = {})
Original traceback:
None
Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 239, in call_function
    out = lowerings[target](*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/lowering.py", line 211, in wrapped
    return decomp_fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/lowering.py", line 968, in handler
    result = ir.FallbackKernel.create(kernel, *args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/ir.py", line 2918, in create
    ) = cls.process_kernel(kernel, *args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/ir.py", line 2297, in process_kernel
    example_output = kernel(*new_args, **new_kwargs)
TypeError: mod expected 2 arguments, got 0

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1122, in check_accuracy
    new_result = optimized_model_iter_fn(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1020, in run_n_iterations
    self.model_iter_fn(mod, inputs, collect_outputs=False)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 332, in forward_and_backward_pass
    cloned_inputs = clone_inputs(inputs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 335, in <graph break in forward_and_backward_pass>
    pred = mod(*cloned_inputs)
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/big_bird/modeling_big_bird.py", line 2462, in forward
    outputs = self.bert(
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/big_bird/modeling_big_bird.py", line 2092, in forward
    ) = self._pad_to_block_size(
  File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/big_bird/modeling_big_bird.py", line 2224, in _pad_to_block_size
    def _pad_to_block_size(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 954, in forward
    return compiled_f(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 940, in new_func
    compiled_fn = create_aot_dispatcher_function(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 663, in create_aot_dispatcher_function
    return aot_dispatch_base(flat_fn, fake_flat_tensor_args, aot_config)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 386, in aot_dispatch_base
    compiled_fw = aot_config.fw_compiler(fw_module, flat_args)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 87, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 351, in fw_compiler
    return compile_fx_inner(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/debug_utils.py", line 444, in debug_wrapper
    compiled_fn = compiler_fn(gm, example_inputs, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/debug.py", line 177, in inner
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/env/lib/python3.9/contextlib.py", line 79, in inner
    return func(*args, **kwds)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 121, in compile_fx_inner
    graph.run(*example_inputs)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 87, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 129, in run
    return super().run(*args)
  File "/scratch/ezyang/work/pytorch/torch/fx/interpreter.py", line 130, in run
    self.env[node] = self.run_node(node)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 299, in run_node
    result = super().run_node(n)
  File "/scratch/ezyang/work/pytorch/torch/fx/interpreter.py", line 171, in run_node
    return getattr(self, n.op)(n.target, args, kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 242, in call_function
    raise LoweringException(e, target, args, kwargs) from e
torch._inductor.exc.LoweringException: TypeError: mod expected 2 arguments, got 0
  target: <built-in function mod>
  args[0]: s1
  args[1]: 64

While executing %mod : [#users=1] = call_function[target=operator.mod](args = (%sym_size, 64), kwargs = {})
Original traceback:
None
TorchDynamo optimized model failed to run because of following error
cuda train hf_BigBird                         FAIL
Running torchbench.py hf_DistilBert...
ERROR:common:'int' object has no attribute 'size'

While executing %sym_size : [#users=2] = placeholder[target=sym_size]
Original traceback:
Module stack: {'self_vocab_transform': <class 'torch.nn.modules.linear.Linear'>}
  File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/distilbert/modeling_distilbert.py", line 659, in <graph break in forward>
    prediction_logits = self.vocab_transform(hidden_states)  # (bs, seq_length, dim)

Gradient addition node due to multiple use of tensor around:
Module stack: {'self_transformer': <class 'transformers.models.distilbert.modeling_distilbert.Transformer'>, 'self_transformer_layer_0': <class 'transformers.models.distilbert.modeling_distilbert.TransformerBlock'>, 'self_transformer_layer_0_attention': <class 'transformers.models.distilbert.modeling_distilbert.MultiHeadSelfAttention'>, 'self_transformer_layer_0_attention_q_lin': <class 'torch.nn.modules.linear.Linear'>}
  File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/distilbert/modeling_distilbert.py", line 207, in forward
    q = shape(self.q_lin(query))  # (bs, n_heads, q_length, dim_per_head)
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/distilbert/modeling_distilbert.py", line 283, in forward
    sa_output = self.attention(
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/distilbert/modeling_distilbert.py", line 345, in forward
    layer_outputs = layer_module(
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/distilbert/modeling_distilbert.py", line 567, in <graph break in forward>
    return self.transformer(
Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1122, in check_accuracy
    new_result = optimized_model_iter_fn(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1020, in run_n_iterations
    self.model_iter_fn(mod, inputs, collect_outputs=False)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 332, in forward_and_backward_pass
    cloned_inputs = clone_inputs(inputs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 337, in <graph break in forward_and_backward_pass>
    self.grad_scaler.scale(loss).backward()
  File "/scratch/ezyang/work/pytorch/torch/_tensor.py", line 450, in backward
    torch.autograd.backward(
  File "/scratch/ezyang/work/pytorch/torch/autograd/__init__.py", line 197, in backward
    Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  File "/scratch/ezyang/work/pytorch/torch/autograd/function.py", line 270, in apply
    return user_fn(self, *args)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 558, in backward
    CompiledFunction.compiled_bw = aot_config.bw_compiler(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/optimizations/backends.py", line 555, in _wrapped_bw_compiler
    return disable(disable(bw_compiler)(*args, **kwargs))
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 87, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 362, in bw_compiler
    return compile_fx_inner(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/debug_utils.py", line 444, in debug_wrapper
    compiled_fn = compiler_fn(gm, example_inputs, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/debug.py", line 177, in inner
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/env/lib/python3.9/contextlib.py", line 79, in inner
    return func(*args, **kwds)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 121, in compile_fx_inner
    graph.run(*example_inputs)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 87, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 129, in run
    return super().run(*args)
  File "/scratch/ezyang/work/pytorch/torch/fx/interpreter.py", line 130, in run
    self.env[node] = self.run_node(node)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 299, in run_node
    result = super().run_node(n)
  File "/scratch/ezyang/work/pytorch/torch/fx/interpreter.py", line 171, in run_node
    return getattr(self, n.op)(n.target, args, kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 199, in placeholder
    sizes, strides = self.static_sizes_strides(example)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 53, in static_sizes_strides
    size = [sympy.Integer(i) for i in ex.size()]
AttributeError: 'int' object has no attribute 'size'

While executing %sym_size : [#users=2] = placeholder[target=sym_size]
Original traceback:
Module stack: {'self_vocab_transform': <class 'torch.nn.modules.linear.Linear'>}
  File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/distilbert/modeling_distilbert.py", line 659, in <graph break in forward>
    prediction_logits = self.vocab_transform(hidden_states)  # (bs, seq_length, dim)

Gradient addition node due to multiple use of tensor around:
Module stack: {'self_transformer': <class 'transformers.models.distilbert.modeling_distilbert.Transformer'>, 'self_transformer_layer_0': <class 'transformers.models.distilbert.modeling_distilbert.TransformerBlock'>, 'self_transformer_layer_0_attention': <class 'transformers.models.distilbert.modeling_distilbert.MultiHeadSelfAttention'>, 'self_transformer_layer_0_attention_q_lin': <class 'torch.nn.modules.linear.Linear'>}
  File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/distilbert/modeling_distilbert.py", line 207, in forward
    q = shape(self.q_lin(query))  # (bs, n_heads, q_length, dim_per_head)
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/distilbert/modeling_distilbert.py", line 283, in forward
    sa_output = self.attention(
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/distilbert/modeling_distilbert.py", line 345, in forward
    layer_outputs = layer_module(
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/distilbert/modeling_distilbert.py", line 567, in <graph break in forward>
    return self.transformer(

TorchDynamo optimized model failed to run because of following error
cuda train hf_DistilBert                      FAIL
Running torchbench.py hf_GPT2...
ERROR:common:RuntimeError: Overloaded torch operator invoked from Python failed to many any schema:
aten::arange() expected at most 5 argument(s) but received 7 argument(s). Declaration: aten::arange(Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

aten::arange() expected at most 6 argument(s) but received 7 argument(s). Declaration: aten::arange.start(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

aten::arange() Expected a value of type 'number' for argument 'end' but instead found type 'Symbol'.
Position: 1
Value: s1
Declaration: aten::arange.start_step(Scalar start, Scalar end, Scalar step=1, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
Cast error details: Cannot cast s1 to number

aten::arange() expected at most 4 argument(s) but received 7 argument(s). Declaration: aten::arange.start_out(Scalar start, Scalar end, Scalar step=1, *, Tensor(a!) out) -> Tensor(a!)

aten::arange() expected at most 2 argument(s) but received 7 argument(s). Declaration: aten::arange.out(Scalar end, *, Tensor(a!) out) -> Tensor(a!)


  target: aten.arange.start
  args[0]: 0
  args[1]: s1
  kwargs: {'dtype': torch.int64, 'device': device(type='cuda', index=0), 'pin_memory': False}

While executing %arange : [#users=1] = call_function[target=torch.ops.aten.arange.start](args = (0, %add), kwargs = {dtype: torch.int64, device: cuda:0, pin_memory: False})
Original traceback:
Module stack: {}
  File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/gpt2/modeling_gpt2.py", line 793, in forward
    position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 239, in call_function
    out = lowerings[target](*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/lowering.py", line 211, in wrapped
    return decomp_fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/lowering.py", line 1269, in arange
    return fallback_arange(
  File "/scratch/ezyang/work/pytorch/torch/_inductor/lowering.py", line 968, in handler
    result = ir.FallbackKernel.create(kernel, *args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/ir.py", line 2918, in create
    ) = cls.process_kernel(kernel, *args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/ir.py", line 2297, in process_kernel
    example_output = kernel(*new_args, **new_kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_ops.py", line 445, in __call__
    return self._op(*args, **kwargs or {})
RuntimeError: Overloaded torch operator invoked from Python failed to many any schema:
aten::arange() expected at most 5 argument(s) but received 7 argument(s). Declaration: aten::arange(Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

aten::arange() expected at most 6 argument(s) but received 7 argument(s). Declaration: aten::arange.start(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

aten::arange() Expected a value of type 'number' for argument 'end' but instead found type 'Symbol'.
Position: 1
Value: s1
Declaration: aten::arange.start_step(Scalar start, Scalar end, Scalar step=1, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
Cast error details: Cannot cast s1 to number

aten::arange() expected at most 4 argument(s) but received 7 argument(s). Declaration: aten::arange.start_out(Scalar start, Scalar end, Scalar step=1, *, Tensor(a!) out) -> Tensor(a!)

aten::arange() expected at most 2 argument(s) but received 7 argument(s). Declaration: aten::arange.out(Scalar end, *, Tensor(a!) out) -> Tensor(a!)


The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1122, in check_accuracy
    new_result = optimized_model_iter_fn(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1020, in run_n_iterations
    self.model_iter_fn(mod, inputs, collect_outputs=False)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 332, in forward_and_backward_pass
    cloned_inputs = clone_inputs(inputs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 335, in <graph break in forward_and_backward_pass>
    pred = mod(*cloned_inputs)
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/gpt2/modeling_gpt2.py", line 1048, in forward
    transformer_outputs = self.transformer(
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/gpt2/modeling_gpt2.py", line 738, in forward
    @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 954, in forward
    return compiled_f(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 940, in new_func
    compiled_fn = create_aot_dispatcher_function(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 660, in create_aot_dispatcher_function
    aot_dispatch_autograd(flat_fn, fake_flat_tensor_args, aot_config)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 516, in aot_dispatch_autograd
    compiled_fw_func = aot_config.fw_compiler(fw_module, deduped_flat_args)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 87, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 351, in fw_compiler
    return compile_fx_inner(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/debug_utils.py", line 444, in debug_wrapper
    compiled_fn = compiler_fn(gm, example_inputs, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/debug.py", line 177, in inner
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/env/lib/python3.9/contextlib.py", line 79, in inner
    return func(*args, **kwds)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 121, in compile_fx_inner
    graph.run(*example_inputs)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 87, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 129, in run
    return super().run(*args)
  File "/scratch/ezyang/work/pytorch/torch/fx/interpreter.py", line 130, in run
    self.env[node] = self.run_node(node)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 299, in run_node
    result = super().run_node(n)
  File "/scratch/ezyang/work/pytorch/torch/fx/interpreter.py", line 171, in run_node
    return getattr(self, n.op)(n.target, args, kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 242, in call_function
    raise LoweringException(e, target, args, kwargs) from e
torch._inductor.exc.LoweringException: RuntimeError: Overloaded torch operator invoked from Python failed to many any schema:
aten::arange() expected at most 5 argument(s) but received 7 argument(s). Declaration: aten::arange(Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

aten::arange() expected at most 6 argument(s) but received 7 argument(s). Declaration: aten::arange.start(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

aten::arange() Expected a value of type 'number' for argument 'end' but instead found type 'Symbol'.
Position: 1
Value: s1
Declaration: aten::arange.start_step(Scalar start, Scalar end, Scalar step=1, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
Cast error details: Cannot cast s1 to number

aten::arange() expected at most 4 argument(s) but received 7 argument(s). Declaration: aten::arange.start_out(Scalar start, Scalar end, Scalar step=1, *, Tensor(a!) out) -> Tensor(a!)

aten::arange() expected at most 2 argument(s) but received 7 argument(s). Declaration: aten::arange.out(Scalar end, *, Tensor(a!) out) -> Tensor(a!)


  target: aten.arange.start
  args[0]: 0
  args[1]: s1
  kwargs: {'dtype': torch.int64, 'device': device(type='cuda', index=0), 'pin_memory': False}

While executing %arange : [#users=1] = call_function[target=torch.ops.aten.arange.start](args = (0, %add), kwargs = {dtype: torch.int64, device: cuda:0, pin_memory: False})
Original traceback:
Module stack: {}
  File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/gpt2/modeling_gpt2.py", line 793, in forward
    position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)

TorchDynamo optimized model failed to run because of following error
cuda train hf_GPT2                            FAIL
Running torchbench.py hf_GPT2_large...
cuda train hf_GPT2_large                      PASS
Running torchbench.py hf_Longformer...
[2022-11-06 03:19:55,668] torch._inductor.ir: [WARNING] Using FallbackKernel: aten.cumsum
[2022-11-06 03:19:55,684] torch._inductor.graph: [WARNING] Creating implicit fallback for:
  target: <built-in function mod>
  args[0]: s1
  args[1]: 512
ERROR:common:TypeError: mod expected 2 arguments, got 0
  target: <built-in function mod>
  args[0]: s1
  args[1]: 512

While executing %mod : [#users=1] = call_function[target=operator.mod](args = (%sym_size_1, 512), kwargs = {})
Original traceback:
None
Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 239, in call_function
    out = lowerings[target](*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/lowering.py", line 211, in wrapped
    return decomp_fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/lowering.py", line 968, in handler
    result = ir.FallbackKernel.create(kernel, *args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/ir.py", line 2918, in create
    ) = cls.process_kernel(kernel, *args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/ir.py", line 2297, in process_kernel
    example_output = kernel(*new_args, **new_kwargs)
TypeError: mod expected 2 arguments, got 0

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1122, in check_accuracy
    new_result = optimized_model_iter_fn(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1020, in run_n_iterations
    self.model_iter_fn(mod, inputs, collect_outputs=False)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 332, in forward_and_backward_pass
    cloned_inputs = clone_inputs(inputs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 335, in <graph break in forward_and_backward_pass>
    pred = mod(*cloned_inputs)
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/longformer/modeling_longformer.py", line 1813, in forward
    outputs = self.longformer(
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/longformer/modeling_longformer.py", line 1616, in forward
    @add_start_docstrings_to_model_forward(LONGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 954, in forward
    return compiled_f(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 940, in new_func
    compiled_fn = create_aot_dispatcher_function(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 660, in create_aot_dispatcher_function
    aot_dispatch_autograd(flat_fn, fake_flat_tensor_args, aot_config)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 516, in aot_dispatch_autograd
    compiled_fw_func = aot_config.fw_compiler(fw_module, deduped_flat_args)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 87, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 351, in fw_compiler
    return compile_fx_inner(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/debug_utils.py", line 444, in debug_wrapper
    compiled_fn = compiler_fn(gm, example_inputs, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/debug.py", line 177, in inner
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/env/lib/python3.9/contextlib.py", line 79, in inner
    return func(*args, **kwds)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 121, in compile_fx_inner
    graph.run(*example_inputs)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 87, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 129, in run
    return super().run(*args)
  File "/scratch/ezyang/work/pytorch/torch/fx/interpreter.py", line 130, in run
    self.env[node] = self.run_node(node)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 299, in run_node
    result = super().run_node(n)
  File "/scratch/ezyang/work/pytorch/torch/fx/interpreter.py", line 171, in run_node
    return getattr(self, n.op)(n.target, args, kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 242, in call_function
    raise LoweringException(e, target, args, kwargs) from e
torch._inductor.exc.LoweringException: TypeError: mod expected 2 arguments, got 0
  target: <built-in function mod>
  args[0]: s1
  args[1]: 512

While executing %mod : [#users=1] = call_function[target=operator.mod](args = (%sym_size_1, 512), kwargs = {})
Original traceback:
None
TorchDynamo optimized model failed to run because of following error
cuda train hf_Longformer                      FAIL
Running torchbench.py hf_Reformer...
ERROR:common:Cannot call sizes() on tensor with symbolic sizes/strides

While executing %lowmem_dropout : [#users=1] = call_function[target=torch._inductor.overrides.lowmem_dropout](args = (%self_word_embeddings,), kwargs = {p: 0.05, training: True})
Original traceback:
None
Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1122, in check_accuracy
    new_result = optimized_model_iter_fn(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1020, in run_n_iterations
    self.model_iter_fn(mod, inputs, collect_outputs=False)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 332, in forward_and_backward_pass
    cloned_inputs = clone_inputs(inputs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 335, in <graph break in forward_and_backward_pass>
    pred = mod(*cloned_inputs)
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/reformer/modeling_reformer.py", line 2397, in forward
    reformer_outputs = self.reformer(
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/reformer/modeling_reformer.py", line 2063, in forward
    least_common_mult_chunk_length = _get_least_common_mult_chunk_len(self.config)
  File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/reformer/modeling_reformer.py", line 2100, in <graph break in forward>
    embedding_output = self.embeddings(
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/reformer/modeling_reformer.py", line 239, in forward
    def forward(self, input_ids=None, position_ids=None, inputs_embeds=None, start_idx_pos_encodings=0):
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 954, in forward
    return compiled_f(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 940, in new_func
    compiled_fn = create_aot_dispatcher_function(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 660, in create_aot_dispatcher_function
    aot_dispatch_autograd(flat_fn, fake_flat_tensor_args, aot_config)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 462, in aot_dispatch_autograd
    out = flat_fn(*flat_args)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 901, in functional_call
    out = Interpreter(mod).run(*args[params_len:], **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/fx/interpreter.py", line 130, in run
    self.env[node] = self.run_node(node)
  File "/scratch/ezyang/work/pytorch/torch/fx/interpreter.py", line 171, in run_node
    return getattr(self, n.op)(n.target, args, kwargs)
  File "/scratch/ezyang/work/pytorch/torch/fx/interpreter.py", line 243, in call_function
    return target(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/overrides.py", line 564, in lowmem_dropout
    result = LowmemDropout.apply(input, p)
RuntimeError: Cannot call sizes() on tensor with symbolic sizes/strides

While executing %lowmem_dropout : [#users=1] = call_function[target=torch._inductor.overrides.lowmem_dropout](args = (%self_word_embeddings,), kwargs = {p: 0.05, training: True})
Original traceback:
None
TorchDynamo optimized model failed to run because of following error
cuda train hf_Reformer                        FAIL
Running torchbench.py hf_T5...
WARNING:common:fp64 golden ref were not generated for hf_T5
ERROR:common:RuntimeError: Overloaded torch operator invoked from Python failed to many any schema:
aten::arange() expected at most 5 argument(s) but received 7 argument(s). Declaration: aten::arange(Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

aten::arange() expected at most 6 argument(s) but received 7 argument(s). Declaration: aten::arange.start(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

aten::arange() Expected a value of type 'number' for argument 'end' but instead found type 'Symbol'.
Position: 1
Value: s1
Declaration: aten::arange.start_step(Scalar start, Scalar end, Scalar step=1, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
Cast error details: Cannot cast s1 to number

aten::arange() expected at most 4 argument(s) but received 7 argument(s). Declaration: aten::arange.start_out(Scalar start, Scalar end, Scalar step=1, *, Tensor(a!) out) -> Tensor(a!)

aten::arange() expected at most 2 argument(s) but received 7 argument(s). Declaration: aten::arange.out(Scalar end, *, Tensor(a!) out) -> Tensor(a!)


  target: aten.arange.default
  args[0]: s1
  kwargs: {'dtype': torch.int64, 'device': device(type='cuda', index=0), 'pin_memory': False}

While executing %arange : [#users=2] = call_function[target=torch.ops.aten.arange.default](args = (%sym_size_2,), kwargs = {dtype: torch.int64, device: cuda:0, pin_memory: False})
Original traceback:
Module stack: {'self_model': <class 'transformers.models.t5.modeling_t5.T5ForConditionalGeneration'>, 'self_model_encoder': <class 'transformers.models.t5.modeling_t5.T5Stack'>, 'self_model_encoder_block_0': <class 'transformers.models.t5.modeling_t5.T5Block'>, 'sub0_0': <class 'transformers.models.t5.modeling_t5.T5LayerSelfAttention'>, 'self_model_encoder_block_0_layer_0_SelfAttention': <class 'transformers.models.t5.modeling_t5.T5Attention'>}
  File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/t5/modeling_t5.py", line 423, in compute_bias
    context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None]
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/t5/modeling_t5.py", line 519, in forward
    position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device)
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/t5/modeling_t5.py", line 570, in forward
    attention_output = self.SelfAttention(
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/t5/modeling_t5.py", line 664, in forward
    self_attention_outputs = self.layer[0](
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/t5/modeling_t5.py", line 1033, in forward
    layer_outputs = layer_module(
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/t5/modeling_t5.py", line 1601, in forward
    encoder_outputs = self.encoder(
 |   File "/scratch/ezyang/work/torchbenchmark/torchbenchmark/util/framework/huggingface/model_factory.py", line 41, in forward
    return self.model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 239, in call_function
    out = lowerings[target](*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/lowering.py", line 211, in wrapped
    return decomp_fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/lowering.py", line 1269, in arange
    return fallback_arange(
  File "/scratch/ezyang/work/pytorch/torch/_inductor/lowering.py", line 968, in handler
    result = ir.FallbackKernel.create(kernel, *args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/ir.py", line 2918, in create
    ) = cls.process_kernel(kernel, *args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/ir.py", line 2297, in process_kernel
    example_output = kernel(*new_args, **new_kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_ops.py", line 445, in __call__
    return self._op(*args, **kwargs or {})
RuntimeError: Overloaded torch operator invoked from Python failed to many any schema:
aten::arange() expected at most 5 argument(s) but received 7 argument(s). Declaration: aten::arange(Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

aten::arange() expected at most 6 argument(s) but received 7 argument(s). Declaration: aten::arange.start(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

aten::arange() Expected a value of type 'number' for argument 'end' but instead found type 'Symbol'.
Position: 1
Value: s1
Declaration: aten::arange.start_step(Scalar start, Scalar end, Scalar step=1, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
Cast error details: Cannot cast s1 to number

aten::arange() expected at most 4 argument(s) but received 7 argument(s). Declaration: aten::arange.start_out(Scalar start, Scalar end, Scalar step=1, *, Tensor(a!) out) -> Tensor(a!)

aten::arange() expected at most 2 argument(s) but received 7 argument(s). Declaration: aten::arange.out(Scalar end, *, Tensor(a!) out) -> Tensor(a!)


The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1122, in check_accuracy
    new_result = optimized_model_iter_fn(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1020, in run_n_iterations
    self.model_iter_fn(mod, inputs, collect_outputs=False)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 332, in forward_and_backward_pass
    cloned_inputs = clone_inputs(inputs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 335, in <graph break in forward_and_backward_pass>
    pred = mod(*cloned_inputs)
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/torchbenchmark/torchbenchmark/util/framework/huggingface/model_factory.py", line 40, in forward
    def forward(self, input_ids, decoder_input_ids):
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 954, in forward
    return compiled_f(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 940, in new_func
    compiled_fn = create_aot_dispatcher_function(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 660, in create_aot_dispatcher_function
    aot_dispatch_autograd(flat_fn, fake_flat_tensor_args, aot_config)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 516, in aot_dispatch_autograd
    compiled_fw_func = aot_config.fw_compiler(fw_module, deduped_flat_args)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 87, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 351, in fw_compiler
    return compile_fx_inner(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/debug_utils.py", line 444, in debug_wrapper
    compiled_fn = compiler_fn(gm, example_inputs, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/debug.py", line 177, in inner
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/env/lib/python3.9/contextlib.py", line 79, in inner
    return func(*args, **kwds)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 121, in compile_fx_inner
    graph.run(*example_inputs)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 87, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 129, in run
    return super().run(*args)
  File "/scratch/ezyang/work/pytorch/torch/fx/interpreter.py", line 130, in run
    self.env[node] = self.run_node(node)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 299, in run_node
    result = super().run_node(n)
  File "/scratch/ezyang/work/pytorch/torch/fx/interpreter.py", line 171, in run_node
    return getattr(self, n.op)(n.target, args, kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 242, in call_function
    raise LoweringException(e, target, args, kwargs) from e
torch._inductor.exc.LoweringException: RuntimeError: Overloaded torch operator invoked from Python failed to many any schema:
aten::arange() expected at most 5 argument(s) but received 7 argument(s). Declaration: aten::arange(Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

aten::arange() expected at most 6 argument(s) but received 7 argument(s). Declaration: aten::arange.start(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

aten::arange() Expected a value of type 'number' for argument 'end' but instead found type 'Symbol'.
Position: 1
Value: s1
Declaration: aten::arange.start_step(Scalar start, Scalar end, Scalar step=1, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
Cast error details: Cannot cast s1 to number

aten::arange() expected at most 4 argument(s) but received 7 argument(s). Declaration: aten::arange.start_out(Scalar start, Scalar end, Scalar step=1, *, Tensor(a!) out) -> Tensor(a!)

aten::arange() expected at most 2 argument(s) but received 7 argument(s). Declaration: aten::arange.out(Scalar end, *, Tensor(a!) out) -> Tensor(a!)


  target: aten.arange.default
  args[0]: s1
  kwargs: {'dtype': torch.int64, 'device': device(type='cuda', index=0), 'pin_memory': False}

While executing %arange : [#users=2] = call_function[target=torch.ops.aten.arange.default](args = (%sym_size_2,), kwargs = {dtype: torch.int64, device: cuda:0, pin_memory: False})
Original traceback:
Module stack: {'self_model': <class 'transformers.models.t5.modeling_t5.T5ForConditionalGeneration'>, 'self_model_encoder': <class 'transformers.models.t5.modeling_t5.T5Stack'>, 'self_model_encoder_block_0': <class 'transformers.models.t5.modeling_t5.T5Block'>, 'sub0_0': <class 'transformers.models.t5.modeling_t5.T5LayerSelfAttention'>, 'self_model_encoder_block_0_layer_0_SelfAttention': <class 'transformers.models.t5.modeling_t5.T5Attention'>}
  File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/t5/modeling_t5.py", line 423, in compute_bias
    context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None]
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/t5/modeling_t5.py", line 519, in forward
    position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device)
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/t5/modeling_t5.py", line 570, in forward
    attention_output = self.SelfAttention(
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/t5/modeling_t5.py", line 664, in forward
    self_attention_outputs = self.layer[0](
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/t5/modeling_t5.py", line 1033, in forward
    layer_outputs = layer_module(
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/t5/modeling_t5.py", line 1601, in forward
    encoder_outputs = self.encoder(
 |   File "/scratch/ezyang/work/torchbenchmark/torchbenchmark/util/framework/huggingface/model_factory.py", line 41, in forward
    return self.model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)

TorchDynamo optimized model failed to run because of following error
cuda train hf_T5                              FAIL
Running torchbench.py hf_T5_base...
WARNING:common:fp64 golden ref were not generated for hf_T5_base
ERROR:common:RuntimeError: Overloaded torch operator invoked from Python failed to many any schema:
aten::arange() expected at most 5 argument(s) but received 7 argument(s). Declaration: aten::arange(Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

aten::arange() expected at most 6 argument(s) but received 7 argument(s). Declaration: aten::arange.start(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

aten::arange() Expected a value of type 'number' for argument 'end' but instead found type 'Symbol'.
Position: 1
Value: s1
Declaration: aten::arange.start_step(Scalar start, Scalar end, Scalar step=1, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
Cast error details: Cannot cast s1 to number

aten::arange() expected at most 4 argument(s) but received 7 argument(s). Declaration: aten::arange.start_out(Scalar start, Scalar end, Scalar step=1, *, Tensor(a!) out) -> Tensor(a!)

aten::arange() expected at most 2 argument(s) but received 7 argument(s). Declaration: aten::arange.out(Scalar end, *, Tensor(a!) out) -> Tensor(a!)


  target: aten.arange.default
  args[0]: s1
  kwargs: {'dtype': torch.int64, 'device': device(type='cuda', index=0), 'pin_memory': False}

While executing %arange : [#users=2] = call_function[target=torch.ops.aten.arange.default](args = (%sym_size_2,), kwargs = {dtype: torch.int64, device: cuda:0, pin_memory: False})
Original traceback:
Module stack: {'self_model': <class 'transformers.models.t5.modeling_t5.T5ForConditionalGeneration'>, 'self_model_encoder': <class 'transformers.models.t5.modeling_t5.T5Stack'>, 'self_model_encoder_block_0': <class 'transformers.models.t5.modeling_t5.T5Block'>, 'sub0_0': <class 'transformers.models.t5.modeling_t5.T5LayerSelfAttention'>, 'self_model_encoder_block_0_layer_0_SelfAttention': <class 'transformers.models.t5.modeling_t5.T5Attention'>}
  File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/t5/modeling_t5.py", line 423, in compute_bias
    context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None]
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/t5/modeling_t5.py", line 519, in forward
    position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device)
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/t5/modeling_t5.py", line 570, in forward
    attention_output = self.SelfAttention(
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/t5/modeling_t5.py", line 664, in forward
    self_attention_outputs = self.layer[0](
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/t5/modeling_t5.py", line 1033, in forward
    layer_outputs = layer_module(
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/t5/modeling_t5.py", line 1601, in forward
    encoder_outputs = self.encoder(
 |   File "/scratch/ezyang/work/torchbenchmark/torchbenchmark/util/framework/huggingface/model_factory.py", line 41, in forward
    return self.model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 239, in call_function
    out = lowerings[target](*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/lowering.py", line 211, in wrapped
    return decomp_fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/lowering.py", line 1269, in arange
    return fallback_arange(
  File "/scratch/ezyang/work/pytorch/torch/_inductor/lowering.py", line 968, in handler
    result = ir.FallbackKernel.create(kernel, *args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/ir.py", line 2918, in create
    ) = cls.process_kernel(kernel, *args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/ir.py", line 2297, in process_kernel
    example_output = kernel(*new_args, **new_kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_ops.py", line 445, in __call__
    return self._op(*args, **kwargs or {})
RuntimeError: Overloaded torch operator invoked from Python failed to many any schema:
aten::arange() expected at most 5 argument(s) but received 7 argument(s). Declaration: aten::arange(Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

aten::arange() expected at most 6 argument(s) but received 7 argument(s). Declaration: aten::arange.start(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

aten::arange() Expected a value of type 'number' for argument 'end' but instead found type 'Symbol'.
Position: 1
Value: s1
Declaration: aten::arange.start_step(Scalar start, Scalar end, Scalar step=1, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
Cast error details: Cannot cast s1 to number

aten::arange() expected at most 4 argument(s) but received 7 argument(s). Declaration: aten::arange.start_out(Scalar start, Scalar end, Scalar step=1, *, Tensor(a!) out) -> Tensor(a!)

aten::arange() expected at most 2 argument(s) but received 7 argument(s). Declaration: aten::arange.out(Scalar end, *, Tensor(a!) out) -> Tensor(a!)


The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1122, in check_accuracy
    new_result = optimized_model_iter_fn(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1020, in run_n_iterations
    self.model_iter_fn(mod, inputs, collect_outputs=False)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 332, in forward_and_backward_pass
    cloned_inputs = clone_inputs(inputs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 335, in <graph break in forward_and_backward_pass>
    pred = mod(*cloned_inputs)
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/torchbenchmark/torchbenchmark/util/framework/huggingface/model_factory.py", line 40, in forward
    def forward(self, input_ids, decoder_input_ids):
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 954, in forward
    return compiled_f(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 940, in new_func
    compiled_fn = create_aot_dispatcher_function(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 660, in create_aot_dispatcher_function
    aot_dispatch_autograd(flat_fn, fake_flat_tensor_args, aot_config)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 516, in aot_dispatch_autograd
    compiled_fw_func = aot_config.fw_compiler(fw_module, deduped_flat_args)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 87, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 351, in fw_compiler
    return compile_fx_inner(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/debug_utils.py", line 444, in debug_wrapper
    compiled_fn = compiler_fn(gm, example_inputs, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/debug.py", line 177, in inner
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/env/lib/python3.9/contextlib.py", line 79, in inner
    return func(*args, **kwds)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 121, in compile_fx_inner
    graph.run(*example_inputs)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 87, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 129, in run
    return super().run(*args)
  File "/scratch/ezyang/work/pytorch/torch/fx/interpreter.py", line 130, in run
    self.env[node] = self.run_node(node)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 299, in run_node
    result = super().run_node(n)
  File "/scratch/ezyang/work/pytorch/torch/fx/interpreter.py", line 171, in run_node
    return getattr(self, n.op)(n.target, args, kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 242, in call_function
    raise LoweringException(e, target, args, kwargs) from e
torch._inductor.exc.LoweringException: RuntimeError: Overloaded torch operator invoked from Python failed to many any schema:
aten::arange() expected at most 5 argument(s) but received 7 argument(s). Declaration: aten::arange(Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

aten::arange() expected at most 6 argument(s) but received 7 argument(s). Declaration: aten::arange.start(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor

aten::arange() Expected a value of type 'number' for argument 'end' but instead found type 'Symbol'.
Position: 1
Value: s1
Declaration: aten::arange.start_step(Scalar start, Scalar end, Scalar step=1, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
Cast error details: Cannot cast s1 to number

aten::arange() expected at most 4 argument(s) but received 7 argument(s). Declaration: aten::arange.start_out(Scalar start, Scalar end, Scalar step=1, *, Tensor(a!) out) -> Tensor(a!)

aten::arange() expected at most 2 argument(s) but received 7 argument(s). Declaration: aten::arange.out(Scalar end, *, Tensor(a!) out) -> Tensor(a!)


  target: aten.arange.default
  args[0]: s1
  kwargs: {'dtype': torch.int64, 'device': device(type='cuda', index=0), 'pin_memory': False}

While executing %arange : [#users=2] = call_function[target=torch.ops.aten.arange.default](args = (%sym_size_2,), kwargs = {dtype: torch.int64, device: cuda:0, pin_memory: False})
Original traceback:
Module stack: {'self_model': <class 'transformers.models.t5.modeling_t5.T5ForConditionalGeneration'>, 'self_model_encoder': <class 'transformers.models.t5.modeling_t5.T5Stack'>, 'self_model_encoder_block_0': <class 'transformers.models.t5.modeling_t5.T5Block'>, 'sub0_0': <class 'transformers.models.t5.modeling_t5.T5LayerSelfAttention'>, 'self_model_encoder_block_0_layer_0_SelfAttention': <class 'transformers.models.t5.modeling_t5.T5Attention'>}
  File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/t5/modeling_t5.py", line 423, in compute_bias
    context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None]
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/t5/modeling_t5.py", line 519, in forward
    position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device)
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/t5/modeling_t5.py", line 570, in forward
    attention_output = self.SelfAttention(
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/t5/modeling_t5.py", line 664, in forward
    self_attention_outputs = self.layer[0](
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/t5/modeling_t5.py", line 1033, in forward
    layer_outputs = layer_module(
 |   File "/scratch/ezyang/work/env/lib/python3.9/site-packages/transformers/models/t5/modeling_t5.py", line 1601, in forward
    encoder_outputs = self.encoder(
 |   File "/scratch/ezyang/work/torchbenchmark/torchbenchmark/util/framework/huggingface/model_factory.py", line 41, in forward
    return self.model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)

TorchDynamo optimized model failed to run because of following error
cuda train hf_T5_base                         FAIL
Running torchbench.py hf_T5_large...
cuda train hf_T5_large                        PASS
Running torchbench.py lennard_jones...
cuda train lennard_jones                      PASS
Running torchbench.py maml_omniglot...
ERROR:common:name 's0' is not defined
Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1122, in check_accuracy
    new_result = optimized_model_iter_fn(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1020, in run_n_iterations
    self.model_iter_fn(mod, inputs, collect_outputs=False)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 332, in forward_and_backward_pass
    cloned_inputs = clone_inputs(inputs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 335, in <graph break in forward_and_backward_pass>
    pred = mod(*cloned_inputs)
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/container.py", line 202, in forward
    def forward(self, input):
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 954, in forward
    return compiled_f(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 945, in new_func
    return compiled_fn(args)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 321, in g
    return f(*args)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 573, in compiled_function
    return CompiledFunction.apply(*remove_dupe_args(args))
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 528, in forward
    fw_outs = call_func_with_args(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 346, in call_func_with_args
    out = normalize_as_list(f(args))
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 185, in run
    return model(new_inputs)
  File "/tmp/torchinductor_ezyang/ns/cnsnwwqwtrtycz5yxyhoiofmimptlux6ptkutivlhorddcq2yppu.py", line 298, in call
    return (buf12, primals_1, primals_3, primals_5, primals_7, primals_9, primals_11, primals_15, primals_16, primals_18, primals_19, primals_21, primals_22, primals_24, buf1, buf2, buf3, buf13, buf5, buf6, buf7, buf14, buf9, buf10, buf15, as_strided(buf11, (5, 64), (64, 1)), as_strided(primals_13, (5, 64), (64, 1)), s0, 1, 1, )
NameError: name 's0' is not defined
TorchDynamo optimized model failed to run because of following error
cuda train maml_omniglot                      FAIL
Running torchbench.py mnasnet1_0...
ERROR:common:AssertionError:
  target: aten.div.Scalar
  args[0]: TensorBox(
    ReinterpretView(
      StorageBox(
        MatrixMultiply(
          name=buf0,
          layout=FixedLayout('cuda', torch.float32, size=[s0, 1280], stride=[1280, 1]),
          inputs=[InputBuffer(name='tangents_1', layout=FixedLayout('cuda', torch.float32, size=[s0, s1], stride=[s1, 1])), InputBuffer(name='permute_1', layout=FixedLayout('cuda', torch.float32, size=[1000, 1280], stride=[1280, 1]))],
          constant_args=(),
          kwargs={},
          output_view=None,
          origins={mm}
        )
      ),
      FixedLayout('cuda', torch.float32, size=[2, 1280, 7, 7], stride=[1280, 1, 0, 0]),
      origins={expand}
    )
  )
  args[1]: 49

While executing %div : [#users=1] = call_function[target=torch.ops.aten.div.Scalar](args = (%expand, %mul_156), kwargs = {})
Original traceback:
Module stack: {}
  File "/scratch/ezyang/work/torchvision/torchvision/models/mnasnet.py", line 161, in forward
    x = x.mean([2, 3])
Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 239, in call_function
    out = lowerings[target](*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/lowering.py", line 211, in wrapped
    return decomp_fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/lowering.py", line 3259, in div
    dtype = get_promoted_dtype(
  File "/scratch/ezyang/work/pytorch/torch/_inductor/lowering.py", line 138, in get_promoted_dtype
    inps = [construct_input(arg) for arg in args]
  File "/scratch/ezyang/work/pytorch/torch/_inductor/lowering.py", line 138, in <listcomp>
    inps = [construct_input(arg) for arg in args]
  File "/scratch/ezyang/work/pytorch/torch/_inductor/lowering.py", line 133, in construct_input
    assert hasattr(inp, "get_dtype")
AssertionError

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1122, in check_accuracy
    new_result = optimized_model_iter_fn(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1020, in run_n_iterations
    self.model_iter_fn(mod, inputs, collect_outputs=False)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 332, in forward_and_backward_pass
    cloned_inputs = clone_inputs(inputs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 337, in <graph break in forward_and_backward_pass>
    self.grad_scaler.scale(loss).backward()
  File "/scratch/ezyang/work/pytorch/torch/_tensor.py", line 450, in backward
    torch.autograd.backward(
  File "/scratch/ezyang/work/pytorch/torch/autograd/__init__.py", line 197, in backward
    Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  File "/scratch/ezyang/work/pytorch/torch/autograd/function.py", line 270, in apply
    return user_fn(self, *args)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 558, in backward
    CompiledFunction.compiled_bw = aot_config.bw_compiler(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/optimizations/backends.py", line 555, in _wrapped_bw_compiler
    return disable(disable(bw_compiler)(*args, **kwargs))
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 87, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 362, in bw_compiler
    return compile_fx_inner(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/debug_utils.py", line 444, in debug_wrapper
    compiled_fn = compiler_fn(gm, example_inputs, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/debug.py", line 177, in inner
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/env/lib/python3.9/contextlib.py", line 79, in inner
    return func(*args, **kwds)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 121, in compile_fx_inner
    graph.run(*example_inputs)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 87, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 129, in run
    return super().run(*args)
  File "/scratch/ezyang/work/pytorch/torch/fx/interpreter.py", line 130, in run
    self.env[node] = self.run_node(node)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 299, in run_node
    result = super().run_node(n)
  File "/scratch/ezyang/work/pytorch/torch/fx/interpreter.py", line 171, in run_node
    return getattr(self, n.op)(n.target, args, kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 242, in call_function
    raise LoweringException(e, target, args, kwargs) from e
torch._inductor.exc.LoweringException: AssertionError:
  target: aten.div.Scalar
  args[0]: TensorBox(
    ReinterpretView(
      StorageBox(
        MatrixMultiply(
          name=buf0,
          layout=FixedLayout('cuda', torch.float32, size=[s0, 1280], stride=[1280, 1]),
          inputs=[InputBuffer(name='tangents_1', layout=FixedLayout('cuda', torch.float32, size=[s0, s1], stride=[s1, 1])), InputBuffer(name='permute_1', layout=FixedLayout('cuda', torch.float32, size=[1000, 1280], stride=[1280, 1]))],
          constant_args=(),
          kwargs={},
          output_view=None,
          origins={mm}
        )
      ),
      FixedLayout('cuda', torch.float32, size=[2, 1280, 7, 7], stride=[1280, 1, 0, 0]),
      origins={expand}
    )
  )
  args[1]: 49

While executing %div : [#users=1] = call_function[target=torch.ops.aten.div.Scalar](args = (%expand, %mul_156), kwargs = {})
Original traceback:
Module stack: {}
  File "/scratch/ezyang/work/torchvision/torchvision/models/mnasnet.py", line 161, in forward
    x = x.mean([2, 3])

TorchDynamo optimized model failed to run because of following error
cuda train mnasnet1_0                         FAIL
Running torchbench.py mobilenet_v2...
ERROR:common:name 's0' is not defined
Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1122, in check_accuracy
    new_result = optimized_model_iter_fn(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1020, in run_n_iterations
    self.model_iter_fn(mod, inputs, collect_outputs=False)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 332, in forward_and_backward_pass
    cloned_inputs = clone_inputs(inputs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 335, in <graph break in forward_and_backward_pass>
    pred = mod(*cloned_inputs)
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/torchvision/torchvision/models/mobilenetv2.py", line 173, in forward
    def forward(self, x: Tensor) -> Tensor:
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 954, in forward
    return compiled_f(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 945, in new_func
    return compiled_fn(args)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 321, in g
    return f(*args)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 573, in compiled_function
    return CompiledFunction.apply(*remove_dupe_args(args))
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 528, in forward
    fw_outs = call_func_with_args(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 346, in call_func_with_args
    out = normalize_as_list(f(args))
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 185, in run
    return model(new_inputs)
  File "/tmp/torchinductor_ezyang/5p/c5pmlevpgwzkkcfxelj7yp3dbehlfc2tqvewaclrxmmgmqmxn4xe.py", line 1292, in call
    return (buf140, primals_1, primals_2, primals_4, primals_5, primals_7, primals_8, primals_10, primals_11, primals_13, primals_14, primals_16, primals_17, primals_19, primals_20, primals_22, primals_23, primals_25, primals_26, primals_28, primals_29, primals_31, primals_32, primals_34, primals_35, primals_37, primals_38, primals_40, primals_41, primals_43, primals_44, primals_46, primals_47, primals_49, primals_50, primals_52, primals_53, primals_55, primals_56, primals_58, primals_59, primals_61, primals_62, primals_64, primals_65, primals_67, primals_68, primals_70, primals_71, primals_73, primals_74, primals_76, primals_77, primals_79, primals_80, primals_82, primals_83, primals_85, primals_86, primals_88, primals_89, primals_91, primals_92, primals_94, primals_95, primals_97, primals_98, primals_100, primals_101, primals_103, primals_104, primals_106, primals_107, primals_109, primals_110, primals_112, primals_113, primals_115, primals_116, primals_118, primals_119, primals_121, primals_122, primals_124, primals_125, primals_127, primals_128, primals_130, primals_131, primals_133, primals_134, primals_136, primals_137, primals_139, primals_140, primals_142, primals_143, primals_145, primals_146, primals_148, primals_149, primals_151, primals_152, primals_154, primals_155, primals_159, primals_160, primals_162, primals_163, primals_165, primals_166, primals_168, primals_169, primals_171, primals_172, primals_174, primals_175, primals_177, primals_178, primals_180, primals_181, primals_183, primals_184, primals_186, primals_187, primals_189, primals_190, primals_192, primals_193, primals_195, primals_196, primals_198, primals_199, primals_201, primals_202, primals_204, primals_205, primals_207, primals_208, primals_210, primals_211, primals_213, primals_214, primals_216, primals_217, primals_219, primals_220, primals_222, primals_223, primals_225, primals_226, primals_228, primals_229, primals_231, primals_232, primals_234, primals_235, primals_237, 
primals_238, primals_240, primals_241, primals_243, primals_244, primals_246, primals_247, primals_249, primals_250, primals_252, primals_253, primals_255, primals_256, primals_258, primals_259, primals_261, primals_262, primals_264, primals_265, primals_267, primals_268, primals_270, primals_271, primals_273, primals_274, primals_276, primals_277, primals_279, primals_280, primals_282, primals_283, primals_285, primals_286, primals_288, primals_289, primals_291, primals_292, primals_294, primals_295, primals_297, primals_298, primals_300, primals_301, primals_303, primals_304, primals_306, primals_307, primals_309, primals_310, primals_312, primals_313, primals_315, buf0, buf2, buf3, buf5, buf6, buf7, buf8, buf10, buf11, buf13, buf14, buf15, buf16, buf18, buf19, buf21, buf22, buf23, buf24, buf26, buf27, buf29, buf30, buf31, buf32, buf34, buf35, buf37, buf38, buf39, buf40, buf42, buf43, buf45, buf46, buf47, buf48, buf50, buf51, buf53, buf54, buf55, buf56, buf58, buf59, buf61, buf62, buf63, buf64, buf66, buf67, buf69, buf70, buf71, buf72, buf74, buf75, buf77, buf78, buf79, buf80, buf82, buf83, buf85, buf86, buf87, buf88, buf90, buf91, buf93, buf94, buf95, buf96, buf98, buf99, buf101, buf102, buf103, buf104, buf106, buf107, buf109, buf110, buf111, buf112, buf114, buf115, buf117, buf118, buf119, buf120, buf122, buf123, buf125, buf126, buf127, buf128, buf130, buf131, buf133, buf134, buf135, buf136, as_strided(buf139, (2, 1280), (1280, 1)), as_strided(primals_157, (1000, 1280), (1280, 1)), buf141, buf142, buf143, buf144, buf145, buf146, buf147, buf148, buf149, buf150, buf151, buf152, buf153, buf154, buf155, buf156, buf157, buf158, buf159, buf160, buf161, buf162, buf163, buf164, buf165, buf166, buf167, buf168, buf169, buf170, buf171, buf172, buf173, buf174, buf175, s0, )
NameError: name 's0' is not defined
TorchDynamo optimized model failed to run because of following error
cuda train mobilenet_v2                       FAIL
Running torchbench.py mobilenet_v2_quantized_qat...
WARNING:common:fp64 golden ref were not generated for mobilenet_v2_quantized_qat
[2022-11-06 03:25:31,839] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:31,849] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:31,868] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:31,876] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:31,884] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:31,899] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:31,907] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:31,914] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:31,931] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:31,938] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:31,955] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:31,962] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:31,969] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:31,986] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:31,993] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,001] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,019] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,027] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,043] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,050] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,057] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,072] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,080] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,088] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,102] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,108] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,116] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,131] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,139] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,146] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,163] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,170] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,177] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,193] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,200] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,217] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,224] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,231] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,246] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,253] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,261] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,276] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,282] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,289] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,304] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,311] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,319] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,333] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,341] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,348] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,363] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,369] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,376] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,391] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,398] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,406] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,423] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,430] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,437] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,454] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,461] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,477] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,485] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,492] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,507] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,515] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,522] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,537] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,543] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,550] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,565] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,572] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,580] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,595] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,602] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,609] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,624] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,630] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,637] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,652] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,659] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,666] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,682] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,689] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,696] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,711] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,717] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,724] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,739] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,746] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,754] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,768] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,776] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,783] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,799] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,806] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,822] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,829] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,836] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,851] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,859] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,866] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,881] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,887] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,895] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,910] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,917] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,924] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,939] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,947] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,954] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,969] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,975] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,982] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:32,997] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:33,004] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:33,012] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:33,028] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:33,036] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:33,043] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:33,059] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:33,067] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:33,083] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:33,090] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:33,097] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:33,112] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:33,120] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:33,128] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:33,142] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:33,148] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:33,156] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:33,171] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:33,178] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:33,185] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:33,200] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:33,207] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:33,215] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:33,229] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:33,235] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:33,243] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:33,258] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:33,265] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:33,272] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:33,287] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:33,295] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:33,302] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:33,318] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:33,325] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:33,342] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:33,349] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:33,356] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:33,361] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:33,366] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:33,370] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:25:33,375] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
ERROR:common:name 's0' is not defined
Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1122, in check_accuracy
    new_result = optimized_model_iter_fn(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1020, in run_n_iterations
    self.model_iter_fn(mod, inputs, collect_outputs=False)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 332, in forward_and_backward_pass
    cloned_inputs = clone_inputs(inputs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 335, in <graph break in forward_and_backward_pass>
    pred = mod(*cloned_inputs)
  File "/scratch/ezyang/work/pytorch/torch/fx/graph_module.py", line 660, in call_wrapped
    return self._wrapped_call(self, *args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/fx/graph_module.py", line 279, in __call__
    raise e
  File "/scratch/ezyang/work/pytorch/torch/fx/graph_module.py", line 269, in __call__
    return super(self.cls, obj).__call__(*args, **kwargs)  # type: ignore[misc]
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "<eval_with_key>.8", line 4, in forward
    def forward(self, x : torch.Tensor) -> torch.Tensor:
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 954, in forward
    return compiled_f(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 945, in new_func
    return compiled_fn(args)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 321, in g
    return f(*args)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 573, in compiled_function
    return CompiledFunction.apply(*remove_dupe_args(args))
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 528, in forward
    fw_outs = call_func_with_args(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 346, in call_func_with_args
    out = normalize_as_list(f(args))
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 185, in run
    return model(new_inputs)
  File "/tmp/torchinductor_ezyang/b3/cb3wg7pypttjp3ij5x2fyoiimpfnqnqikmfmpcbpkyhe7rcmdlmw.py", line 4565, in call
    return (buf1335, primals_1, primals_2, primals_4, primals_5, primals_7, primals_8, primals_10, primals_11, primals_13, primals_14, primals_16, primals_17, primals_19, primals_20, primals_22, primals_23, primals_25, primals_26, primals_28, primals_29, primals_31, primals_32, primals_34, primals_35, primals_37, primals_38, primals_40, primals_41, primals_43, primals_44, primals_46, primals_47, primals_49, primals_50, primals_52, primals_53, primals_55, primals_56, primals_58, primals_59, primals_61, primals_62, primals_64, primals_65, primals_67, primals_68, primals_70, primals_71, primals_73, primals_74, primals_76, primals_77, primals_79, primals_80, primals_82, primals_83, primals_85, primals_86, primals_88, primals_89, primals_91, primals_92, primals_94, primals_95, primals_97, primals_98, primals_100, primals_101, primals_103, primals_104, primals_106, primals_107, primals_109, primals_110, primals_112, primals_113, primals_115, primals_116, primals_118, primals_119, primals_121, primals_122, primals_124, primals_125, primals_127, primals_128, primals_130, primals_131, primals_133, primals_134, primals_136, primals_137, primals_139, primals_140, primals_142, primals_143, primals_145, primals_146, primals_148, primals_149, primals_151, primals_152, primals_154, primals_155, primals_166, primals_167, primals_183, primals_184, primals_200, primals_201, primals_217, primals_218, primals_234, primals_235, primals_251, primals_252, primals_268, primals_269, primals_285, primals_286, primals_302, primals_303, primals_326, primals_327, primals_343, primals_344, primals_360, primals_361, primals_377, primals_378, primals_394, primals_395, primals_411, primals_412, primals_435, primals_436, primals_452, primals_453, primals_469, primals_470, primals_493, primals_494, primals_510, primals_511, primals_527, primals_528, primals_544, primals_545, primals_561, primals_562, primals_578, primals_579, primals_602, primals_603, primals_619, primals_620, primals_636, 
primals_637, primals_660, primals_661, primals_677, primals_678, primals_694, primals_695, primals_718, primals_719, primals_735, primals_736, primals_752, primals_753, primals_769, primals_770, primals_786, primals_787, primals_803, primals_804, primals_827, primals_828, primals_844, primals_845, primals_861, primals_862, primals_885, primals_886, primals_902, primals_903, primals_919, primals_920, primals_936, primals_937, primals_953, primals_954, primals_970, primals_971, primals_994, primals_995, primals_1011, primals_1012, primals_1028, primals_1029, primals_1052, primals_1053, primals_1069, primals_1070, primals_1086, primals_1087, primals_1103, primals_1104, buf1, buf9, buf10, buf16, buf20, buf27, buf28, buf35, buf36, buf42, buf46, buf53, buf54, buf61, buf62, buf68, buf71, buf72, buf79, buf80, buf86, buf90, buf97, buf98, buf105, buf106, buf112, buf116, buf123, buf124, buf131, buf132, buf138, buf141, buf142, buf149, buf150, buf156, buf160, buf167, buf168, buf175, buf176, buf182, buf186, buf193, buf194, buf201, buf202, buf208, buf212, buf219, buf220, buf227, buf228, buf234, buf238, buf245, buf246, buf253, buf254, buf260, buf264, buf271, buf272, buf279, buf280, buf286, buf289, buf290, buf297, buf298, buf304, buf308, buf315, buf316, buf323, buf324, buf330, buf334, buf341, buf342, buf349, buf350, buf356, buf360, buf367, buf368, buf375, buf376, buf382, buf386, buf393, buf394, buf401, buf402, buf408, buf412, buf419, buf420, buf427, buf428, buf434, buf438, buf445, buf446, buf453, buf454, buf460, buf464, buf471, buf472, buf479, buf480, buf486, buf490, buf497, buf498, buf505, buf506, buf512, buf515, buf516, buf523, buf524, buf530, buf534, buf541, buf542, buf549, buf550, buf556, buf560, buf567, buf568, buf575, buf576, buf582, buf586, buf593, buf594, buf601, buf602, buf608, buf612, buf619, buf620, buf627, buf628, buf634, buf638, buf645, buf646, buf653, buf654, buf660, buf664, buf671, buf672, buf679, buf680, buf686, buf690, buf697, buf698, buf705, buf706, buf712, 
buf716, buf723, buf724, buf731, buf732, buf738, buf742, buf749, buf750, buf757, buf758, buf764, buf768, buf775, buf776, buf783, buf784, buf790, buf794, buf801, buf802, buf809, buf810, buf816, buf819, buf820, buf827, buf828, buf834, buf838, buf845, buf846, buf853, buf854, buf860, buf864, buf871, buf872, buf879, buf880, buf886, buf890, buf897, buf898, buf905, buf906, buf912, buf916, buf923, buf924, buf931, buf932, buf938, buf942, buf949, buf950, buf957, buf958, buf964, buf968, buf975, buf976, buf983, buf984, buf990, buf994, buf1001, buf1002, buf1009, buf1010, buf1016, buf1020, buf1027, buf1028, buf1035, buf1036, buf1042, buf1045, buf1046, buf1053, buf1054, buf1060, buf1064, buf1071, buf1072, buf1079, buf1080, buf1086, buf1090, buf1097, buf1098, buf1105, buf1106, buf1112, buf1116, buf1123, buf1124, buf1131, buf1132, buf1138, buf1142, buf1149, buf1150, buf1157, buf1158, buf1164, buf1168, buf1175, buf1176, buf1183, buf1184, buf1190, buf1194, buf1201, buf1202, buf1209, buf1210, buf1216, buf1220, buf1227, buf1228, buf1235, buf1236, buf1242, buf1246, buf1253, buf1254, buf1261, buf1262, buf1268, buf1271, buf1272, buf1279, buf1280, buf1286, buf1290, buf1298, buf1307, buf1314, buf1320, buf1321, buf1328, buf1336, as_strided(buf1327, (1000, 1280), (1280, 1)), buf1341, buf1342, buf1343, buf1344, buf1345, buf1346, buf1347, buf1348, buf1349, buf1350, buf1351, buf1352, buf1353, buf1354, buf1355, buf1356, buf1357, buf1358, buf1359, buf1360, buf1361, buf1362, buf1363, buf1364, buf1365, buf1366, buf1367, buf1368, buf1369, buf1370, buf1371, buf1372, buf1373, buf1374, buf1375, s0, )
NameError: name 's0' is not defined
TorchDynamo optimized model failed to run because of following error
cuda train mobilenet_v2_quantized_qat         FAIL
Running torchbench.py mobilenet_v3_large...
ERROR:common:name 's0' is not defined
Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1122, in check_accuracy
    new_result = optimized_model_iter_fn(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1020, in run_n_iterations
    self.model_iter_fn(mod, inputs, collect_outputs=False)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 332, in forward_and_backward_pass
    cloned_inputs = clone_inputs(inputs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 335, in <graph break in forward_and_backward_pass>
    pred = mod(*cloned_inputs)
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/torchvision/torchvision/models/mobilenetv3.py", line 219, in forward
    def forward(self, x: Tensor) -> Tensor:
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 954, in forward
    return compiled_f(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 945, in new_func
    return compiled_fn(args)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 321, in g
    return f(*args)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 573, in compiled_function
    return CompiledFunction.apply(*remove_dupe_args(args))
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 528, in forward
    fw_outs = call_func_with_args(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 346, in call_func_with_args
    out = normalize_as_list(f(args))
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 185, in run
    return model(new_inputs)
  File "/tmp/torchinductor_ezyang/6f/c6fidbx6g47kmntubx55f6jcmj7wta3vdpjsihaujcszsh6oa5sa.py", line 2306, in call
    return (buf171, primals_1, primals_2, primals_4, primals_5, primals_7, primals_8, primals_10, primals_11, primals_13, primals_14, primals_16, primals_17, primals_19, primals_20, primals_22, primals_23, primals_25, primals_26, primals_28, primals_29, primals_31, primals_32, primals_34, primals_36, primals_38, primals_39, primals_41, primals_42, primals_44, primals_45, primals_47, primals_49, primals_51, primals_52, primals_54, primals_55, primals_57, primals_58, primals_60, primals_62, primals_64, primals_65, primals_67, primals_68, primals_70, primals_71, primals_73, primals_74, primals_76, primals_77, primals_79, primals_80, primals_82, primals_83, primals_85, primals_86, primals_88, primals_89, primals_91, primals_92, primals_94, primals_95, primals_97, primals_98, primals_100, primals_101, primals_103, primals_104, primals_106, primals_107, primals_109, primals_111, primals_113, primals_114, primals_116, primals_117, primals_119, primals_120, primals_122, primals_124, primals_126, primals_127, primals_129, primals_130, primals_132, primals_133, primals_135, primals_137, primals_139, primals_140, primals_142, primals_143, primals_145, primals_146, primals_148, primals_150, primals_152, primals_153, primals_155, primals_156, primals_158, primals_159, primals_161, primals_163, primals_165, primals_166, primals_168, primals_169, primals_175, primals_176, primals_178, primals_179, primals_181, primals_182, primals_184, primals_185, primals_187, primals_188, primals_190, primals_191, primals_193, primals_194, primals_196, primals_197, primals_199, primals_200, primals_202, primals_203, primals_205, primals_206, primals_208, primals_209, primals_211, primals_212, primals_214, primals_215, primals_217, primals_218, primals_220, primals_221, primals_223, primals_224, primals_226, primals_227, primals_229, primals_230, primals_232, primals_233, primals_235, primals_236, primals_238, primals_239, primals_241, primals_242, primals_244, primals_245, primals_247, 
primals_248, primals_250, primals_251, primals_253, primals_254, primals_256, primals_257, primals_259, primals_260, primals_262, primals_263, primals_265, primals_266, primals_268, primals_269, primals_271, primals_272, primals_274, primals_275, primals_277, primals_278, primals_280, primals_281, primals_283, primals_284, primals_286, primals_287, primals_289, primals_290, primals_292, primals_293, primals_295, primals_296, primals_298, primals_299, primals_301, primals_302, primals_304, primals_305, primals_307, primals_308, primals_310, primals_311, primals_313, buf0, buf172, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, buf11, buf12, buf13, buf14, buf15, buf16, buf17, buf18, buf19, buf20, buf21, buf22, buf24, buf26, buf28, buf30, buf31, buf32, buf33, buf34, buf35, buf37, buf39, buf41, buf43, buf44, buf45, buf46, buf47, buf48, buf50, buf52, buf54, buf56, buf57, buf58, buf173, buf60, buf61, buf174, buf63, buf64, buf65, buf66, buf175, buf68, buf69, buf176, buf71, buf72, buf73, buf74, buf177, buf76, buf77, buf178, buf79, buf80, buf81, buf82, buf179, buf84, buf85, buf180, buf87, buf88, buf89, buf90, buf181, buf92, buf93, buf182, buf95, buf97, buf99, buf101, buf103, buf104, buf105, buf183, buf107, buf108, buf184, buf110, buf112, buf114, buf116, buf118, buf119, buf120, buf185, buf122, buf123, buf186, buf125, buf127, buf129, buf131, buf133, buf134, buf135, buf187, buf137, buf138, buf188, buf140, buf142, buf144, buf146, buf148, buf149, buf150, buf189, buf152, buf153, buf190, buf155, buf157, buf159, buf161, buf163, buf164, buf165, buf191, as_strided(buf168, (2, 960), (960, 1)), buf169, buf170, as_strided(primals_173, (1000, 1280), (1280, 1)), as_strided(primals_171, (1280, 960), (960, 1)), buf192, buf193, buf194, buf195, buf196, buf197, buf198, buf199, s0, )
NameError: name 's0' is not defined
TorchDynamo optimized model failed to run because of following error
cuda train mobilenet_v3_large                 FAIL
Running torchbench.py moco...
ERROR:common:name 's0' is not defined
Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1122, in check_accuracy
    new_result = optimized_model_iter_fn(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1020, in run_n_iterations
    self.model_iter_fn(mod, inputs, collect_outputs=False)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 332, in forward_and_backward_pass
    cloned_inputs = clone_inputs(inputs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 335, in <graph break in forward_and_backward_pass>
    pred = mod(*cloned_inputs)
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/nn/parallel/distributed.py", line 1093, in forward
    output = self._run_ddp_forward(*inputs, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/nn/parallel/distributed.py", line 1047, in _run_ddp_forward
    return module_to_run(*inputs[0], **kwargs[0])
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/torchbenchmark/torchbenchmark/models/moco/moco/builder.py", line 115, in forward
    def forward(self, im_q, im_k):
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 954, in forward
    return compiled_f(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 945, in new_func
    return compiled_fn(args)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 321, in g
    return f(*args)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 573, in compiled_function
    return CompiledFunction.apply(*remove_dupe_args(args))
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 528, in forward
    fw_outs = call_func_with_args(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 346, in call_func_with_args
    out = normalize_as_list(f(args))
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 185, in run
    return model(new_inputs)
  File "/tmp/torchinductor_ezyang/e7/ce7lflz22usirdvhj27dv7435ibeuq6e57dw2aq6sdrgpdvevizi.py", line 1199, in call
    return (buf113, primals_1, primals_2, primals_4, primals_5, primals_7, primals_8, primals_10, primals_11, primals_13, primals_14, primals_16, primals_17, primals_19, primals_20, primals_22, primals_23, primals_25, primals_26, primals_28, primals_29, primals_31, primals_32, primals_34, primals_35, primals_37, primals_38, primals_40, primals_41, primals_43, primals_44, primals_46, primals_47, primals_49, primals_50, primals_52, primals_53, primals_55, primals_56, primals_58, primals_59, primals_61, primals_62, primals_64, primals_65, primals_67, primals_68, primals_70, primals_71, primals_73, primals_74, primals_76, primals_77, primals_79, primals_80, primals_82, primals_83, primals_85, primals_86, primals_88, primals_89, primals_91, primals_92, primals_94, primals_95, primals_97, primals_98, primals_100, primals_101, primals_103, primals_104, primals_106, primals_107, primals_109, primals_110, primals_112, primals_113, primals_115, primals_116, primals_118, primals_119, primals_121, primals_122, primals_124, primals_125, primals_127, primals_128, primals_130, primals_131, primals_133, primals_134, primals_136, primals_137, primals_139, primals_140, primals_142, primals_143, primals_145, primals_146, primals_148, primals_149, primals_151, primals_152, primals_154, primals_155, primals_157, primals_158, primals_162, primals_163, primals_165, primals_166, primals_168, primals_169, primals_171, primals_172, primals_174, primals_175, primals_177, primals_178, primals_180, primals_181, primals_183, primals_184, primals_186, primals_187, primals_189, primals_190, primals_192, primals_193, primals_195, primals_196, primals_198, primals_199, primals_201, primals_202, primals_204, primals_205, primals_207, primals_208, primals_210, primals_211, primals_213, primals_214, primals_216, primals_217, primals_219, primals_220, primals_222, primals_223, primals_225, primals_226, primals_228, primals_229, primals_231, primals_232, primals_234, primals_235, primals_237, 
primals_238, primals_240, primals_241, primals_243, primals_244, primals_246, primals_247, primals_249, primals_250, primals_252, primals_253, primals_255, primals_256, primals_258, primals_259, primals_261, primals_262, primals_264, primals_265, primals_267, primals_268, primals_270, primals_271, primals_273, primals_274, primals_276, primals_277, primals_279, primals_280, primals_282, primals_283, primals_285, primals_286, primals_288, primals_289, primals_291, primals_292, primals_294, primals_295, primals_297, primals_298, primals_300, primals_301, primals_303, primals_304, primals_306, primals_307, primals_309, primals_310, primals_312, primals_313, primals_315, primals_316, primals_318, primals_319, primals_321, buf0, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf11, buf12, buf13, buf14, buf15, buf16, buf17, buf18, buf19, buf20, buf21, buf22, buf23, buf24, buf25, buf26, buf27, buf28, buf29, buf31, buf32, buf33, buf34, buf35, buf36, buf37, buf38, buf39, buf40, buf41, buf42, buf43, buf44, buf45, buf46, buf47, buf48, buf49, buf50, buf51, buf52, buf53, buf54, buf55, buf57, buf58, buf59, buf60, buf61, buf62, buf63, buf64, buf65, buf66, buf67, buf68, buf69, buf70, buf71, buf72, buf73, buf74, buf75, buf76, buf77, buf78, buf79, buf80, buf81, buf82, buf83, buf84, buf85, buf86, buf87, buf88, buf89, buf90, buf91, buf92, buf93, buf95, buf96, buf97, buf98, buf99, buf100, buf101, buf102, buf103, buf104, buf105, buf106, as_strided(buf109, (2, 2048), (2048, 1)), buf110, buf112, as_strided(primals_160, (128, 2048), (2048, 1)), buf114, s0, )
NameError: name 's0' is not defined
TorchDynamo optimized model failed to run because of following error
cuda train moco                               FAIL
Running torchbench.py nvidia_deeprecommender...
cuda train nvidia_deeprecommender             PASS
Running torchbench.py pytorch_CycleGAN_and_pix2pix...
ERROR:common:[TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.float32,
    reciprocal(exp(load(buf170, i3 + 256 * i2 + 65536 * i1) + load(primals_48, i1) * constant(-2.0, torch.float32)) + constant(1.0, torch.float32)) * constant(2.0, torch.float32) - constant(1.0, torch.float32),
    ranges=torch.Size([1, 3, 256, 256]),
    origins={sub_23}
  )
)), TensorBox(StorageBox(
  InputBuffer(name='primals_1', layout=FixedLayout('cuda', torch.float32, size=[64, 3, 7, 7], stride=[147, 49, 7, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_3', layout=FixedLayout('cuda', torch.float32, size=[128, 64, 3, 3], stride=[576, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_5', layout=FixedLayout('cuda', torch.float32, size=[256, 128, 3, 3], stride=[1152, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_7', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_9', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_11', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_13', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_15', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_17', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_19', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_21', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_23', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_25', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_27', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_29', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_31', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_33', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_35', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_37', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_39', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_41', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_43', layout=FixedLayout('cuda', torch.float32, size=[256, 128, 3, 3], stride=[1152, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_45', layout=FixedLayout('cuda', torch.float32, size=[128, 64, 3, 3], stride=[576, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_47', layout=FixedLayout('cuda', torch.float32, size=[3, 64, 7, 7], stride=[3136, 49, 7, 1]))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf0', layout=FixedLayout('cuda', torch.float32, size=[1, 3, 262, 262], stride=[205932, 68644, 262, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_49, constant(255, torch.int32) - abs(constant(255, torch.int32) - abs(index_expr(i3, torch.int32) - constant(3, torch.int32))) + constant(255, torch.int32) - abs(constant(255, torch.int32) - abs(index_expr(i2, torch.int32) - constant(3, torch.int32))) * s1 + i1 * s1**2),
    ranges=[1, 3, 262, 262],
    origins={reflection_pad2d}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf2', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 64, 256, 256]), stride=[4194304, 65536, 256, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf1, i3 + 256 * i2 + 65536 * i1) + load(primals_2, i1),
    ranges=torch.Size([1, 64, 256, 256]),
    origins={convolution}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf6, i1) / index_expr(65536, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 64, 1, 1],
            origins={reciprocal}
          )
        ),
        size=(1, 64, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_3}
      ),
      size=(1, 64),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_4}
    ),
    size=(64,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_5}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf9', layout=FixedLayout('cuda', torch.float32, size=torch.Size([1, 64, 256, 256]), stride=[4194304, 65536, 256, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf2, i3 + 256 * i2 + 65536 * i1) - load(buf8, i1) / index_expr(65536, torch.float32) * reciprocal(sqrt(load(buf6, i1) / index_expr(65536, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32)),
    ranges=torch.Size([1, 64, 256, 256]),
    origins={relu}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf11', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 128, 128, 128]), stride=[2097152, 16384, 128, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf10, i3 + 128 * i2 + 16384 * i1) + load(primals_4, i1),
    ranges=torch.Size([1, 128, 128, 128]),
    origins={convolution_1}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf15, i1) / index_expr(16384, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 128, 1, 1],
            origins={reciprocal_1}
          )
        ),
        size=(1, 128, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_9}
      ),
      size=(1, 128),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_10}
    ),
    size=(128,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_11}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf18', layout=FixedLayout('cuda', torch.float32, size=torch.Size([1, 128, 128, 128]), stride=[2097152, 16384, 128, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf11, i3 + 128 * i2 + 16384 * i1) - load(buf17, i1) / index_expr(16384, torch.float32) * reciprocal(sqrt(load(buf15, i1) / index_expr(16384, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32)),
    ranges=torch.Size([1, 128, 128, 128]),
    origins={relu_1}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf20', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf19, i3 + 64 * i2 + 4096 * i1) + load(primals_6, i1),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={convolution_2}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf22, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 256, 1, 1],
            origins={reciprocal_2}
          )
        ),
        size=(1, 256, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_15}
      ),
      size=(1, 256),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_16}
    ),
    size=(256,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_17}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf24', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf20, i3 + 64 * i2 + 4096 * i1) - load(buf23, i1) / index_expr(4096, torch.float32) * reciprocal(sqrt(load(buf22, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32)),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={relu_2}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf25', layout=FixedLayout('cuda', torch.float32, size=[1, 256, 66, 66], stride=[1115136, 4356, 66, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf24, constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i3, torch.int32) - constant(1, torch.int32))) + 64 * constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i2, torch.int32) - constant(1, torch.int32))) + 4096 * i1),
    ranges=[1, 256, 66, 66],
    origins={reflection_pad2d_1}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf27', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf26, i3 + 64 * i2 + 4096 * i1) + load(primals_8, i1),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={convolution_3}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf29, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 256, 1, 1],
            origins={reciprocal_3}
          )
        ),
        size=(1, 256, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_21}
      ),
      size=(1, 256),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_22}
    ),
    size=(256,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_23}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf31', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf27, i3 + 64 * i2 + 4096 * i1) - load(buf30, i1) / index_expr(4096, torch.float32) * reciprocal(sqrt(load(buf29, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32)),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={relu_3}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf32', layout=FixedLayout('cuda', torch.float32, size=[1, 256, 66, 66], stride=[1115136, 4356, 66, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf31, constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i3, torch.int32) - constant(1, torch.int32))) + 64 * constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i2, torch.int32) - constant(1, torch.int32))) + 4096 * i1),
    ranges=[1, 256, 66, 66],
    origins={reflection_pad2d_2}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf34', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf33, i3 + 64 * i2 + 4096 * i1) + load(primals_10, i1),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={convolution_4}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf36, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 256, 1, 1],
            origins={reciprocal_4}
          )
        ),
        size=(1, 256, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_27}
      ),
      size=(1, 256),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_28}
    ),
    size=(256,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_29}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf38', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf24, i3 + 64 * i2 + 4096 * i1) + load(buf34, i3 + 64 * i2 + 4096 * i1) - load(buf37, i1) / index_expr(4096, torch.float32) * reciprocal(sqrt(load(buf36, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={add_10}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf39', layout=FixedLayout('cuda', torch.float32, size=[1, 256, 66, 66], stride=[1115136, 4356, 66, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf38, constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i3, torch.int32) - constant(1, torch.int32))) + 64 * constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i2, torch.int32) - constant(1, torch.int32))) + 4096 * i1),
    ranges=[1, 256, 66, 66],
    origins={reflection_pad2d_3}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf41', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf40, i3 + 64 * i2 + 4096 * i1) + load(primals_12, i1),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={convolution_5}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf43, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 256, 1, 1],
            origins={reciprocal_5}
          )
        ),
        size=(1, 256, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_33}
      ),
      size=(1, 256),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_34}
    ),
    size=(256,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_35}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf45', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf41, i3 + 64 * i2 + 4096 * i1) - load(buf44, i1) / index_expr(4096, torch.float32) * reciprocal(sqrt(load(buf43, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32)),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={relu_4}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf46', layout=FixedLayout('cuda', torch.float32, size=[1, 256, 66, 66], stride=[1115136, 4356, 66, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf45, constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i3, torch.int32) - constant(1, torch.int32))) + 64 * constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i2, torch.int32) - constant(1, torch.int32))) + 4096 * i1),
    ranges=[1, 256, 66, 66],
    origins={reflection_pad2d_4}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf48', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf47, i3 + 64 * i2 + 4096 * i1) + load(primals_14, i1),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={convolution_6}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf50, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 256, 1, 1],
            origins={reciprocal_6}
          )
        ),
        size=(1, 256, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_39}
      ),
      size=(1, 256),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_40}
    ),
    size=(256,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_41}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf52', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf38, i3 + 64 * i2 + 4096 * i1) + load(buf48, i3 + 64 * i2 + 4096 * i1) - load(buf51, i1) / index_expr(4096, torch.float32) * reciprocal(sqrt(load(buf50, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={add_15}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf53', layout=FixedLayout('cuda', torch.float32, size=[1, 256, 66, 66], stride=[1115136, 4356, 66, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf52, constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i3, torch.int32) - constant(1, torch.int32))) + 64 * constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i2, torch.int32) - constant(1, torch.int32))) + 4096 * i1),
    ranges=[1, 256, 66, 66],
    origins={reflection_pad2d_5}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf55', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf54, i3 + 64 * i2 + 4096 * i1) + load(primals_16, i1),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={convolution_7}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf57, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 256, 1, 1],
            origins={reciprocal_7}
          )
        ),
        size=(1, 256, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_45}
      ),
      size=(1, 256),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_46}
    ),
    size=(256,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_47}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf59', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf55, i3 + 64 * i2 + 4096 * i1) - load(buf58, i1) / index_expr(4096, torch.float32) * reciprocal(sqrt(load(buf57, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32)),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={relu_5}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf60', layout=FixedLayout('cuda', torch.float32, size=[1, 256, 66, 66], stride=[1115136, 4356, 66, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf59, constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i3, torch.int32) - constant(1, torch.int32))) + 64 * constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i2, torch.int32) - constant(1, torch.int32))) + 4096 * i1),
    ranges=[1, 256, 66, 66],
    origins={reflection_pad2d_6}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf62', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf61, i3 + 64 * i2 + 4096 * i1) + load(primals_18, i1),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={convolution_8}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf64, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 256, 1, 1],
            origins={reciprocal_8}
          )
        ),
        size=(1, 256, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_51}
      ),
      size=(1, 256),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_52}
    ),
    size=(256,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_53}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf66', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf52, i3 + 64 * i2 + 4096 * i1) + load(buf62, i3 + 64 * i2 + 4096 * i1) - load(buf65, i1) / index_expr(4096, torch.float32) * reciprocal(sqrt(load(buf64, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={add_20}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf67', layout=FixedLayout('cuda', torch.float32, size=[1, 256, 66, 66], stride=[1115136, 4356, 66, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf66, constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i3, torch.int32) - constant(1, torch.int32))) + 64 * constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i2, torch.int32) - constant(1, torch.int32))) + 4096 * i1),
    ranges=[1, 256, 66, 66],
    origins={reflection_pad2d_7}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf69', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf68, i3 + 64 * i2 + 4096 * i1) + load(primals_20, i1),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={convolution_9}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf71, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 256, 1, 1],
            origins={reciprocal_9}
          )
        ),
        size=(1, 256, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_57}
      ),
      size=(1, 256),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_58}
    ),
    size=(256,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_59}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf73', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf69, i3 + 64 * i2 + 4096 * i1) - load(buf72, i1) / index_expr(4096, torch.float32) * reciprocal(sqrt(load(buf71, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32)),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={relu_6}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf74', layout=FixedLayout('cuda', torch.float32, size=[1, 256, 66, 66], stride=[1115136, 4356, 66, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf73, constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i3, torch.int32) - constant(1, torch.int32))) + 64 * constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i2, torch.int32) - constant(1, torch.int32))) + 4096 * i1),
    ranges=[1, 256, 66, 66],
    origins={reflection_pad2d_8}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf76', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf75, i3 + 64 * i2 + 4096 * i1) + load(primals_22, i1),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={convolution_10}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf78, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 256, 1, 1],
            origins={reciprocal_10}
          )
        ),
        size=(1, 256, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_63}
      ),
      size=(1, 256),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_64}
    ),
    size=(256,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_65}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf80', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf66, i3 + 64 * i2 + 4096 * i1) + load(buf76, i3 + 64 * i2 + 4096 * i1) - load(buf79, i1) / index_expr(4096, torch.float32) * reciprocal(sqrt(load(buf78, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={add_25}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf81', layout=FixedLayout('cuda', torch.float32, size=[1, 256, 66, 66], stride=[1115136, 4356, 66, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf80, constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i3, torch.int32) - constant(1, torch.int32))) + 64 * constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i2, torch.int32) - constant(1, torch.int32))) + 4096 * i1),
    ranges=[1, 256, 66, 66],
    origins={reflection_pad2d_9}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf83', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf82, i3 + 64 * i2 + 4096 * i1) + load(primals_24, i1),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={convolution_11}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf85, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 256, 1, 1],
            origins={reciprocal_11}
          )
        ),
        size=(1, 256, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_69}
      ),
      size=(1, 256),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_70}
    ),
    size=(256,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_71}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf87', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf83, i3 + 64 * i2 + 4096 * i1) - load(buf86, i1) / index_expr(4096, torch.float32) * reciprocal(sqrt(load(buf85, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32)),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={relu_7}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf88', layout=FixedLayout('cuda', torch.float32, size=[1, 256, 66, 66], stride=[1115136, 4356, 66, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf87, constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i3, torch.int32) - constant(1, torch.int32))) + 64 * constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i2, torch.int32) - constant(1, torch.int32))) + 4096 * i1),
    ranges=[1, 256, 66, 66],
    origins={reflection_pad2d_10}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf90', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf89, i3 + 64 * i2 + 4096 * i1) + load(primals_26, i1),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={convolution_12}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf92, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 256, 1, 1],
            origins={reciprocal_12}
          )
        ),
        size=(1, 256, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_75}
      ),
      size=(1, 256),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_76}
    ),
    size=(256,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_77}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf94', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf80, i3 + 64 * i2 + 4096 * i1) + load(buf90, i3 + 64 * i2 + 4096 * i1) - load(buf93, i1) / index_expr(4096, torch.float32) * reciprocal(sqrt(load(buf92, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={add_30}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf95', layout=FixedLayout('cuda', torch.float32, size=[1, 256, 66, 66], stride=[1115136, 4356, 66, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf94, constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i3, torch.int32) - constant(1, torch.int32))) + 64 * constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i2, torch.int32) - constant(1, torch.int32))) + 4096 * i1),
    ranges=[1, 256, 66, 66],
    origins={reflection_pad2d_11}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf97', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf96, i3 + 64 * i2 + 4096 * i1) + load(primals_28, i1),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={convolution_13}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf99, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 256, 1, 1],
            origins={reciprocal_13}
          )
        ),
        size=(1, 256, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_81}
      ),
      size=(1, 256),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_82}
    ),
    size=(256,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_83}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf101', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf97, i3 + 64 * i2 + 4096 * i1) - load(buf100, i1) / index_expr(4096, torch.float32) * reciprocal(sqrt(load(buf99, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32)),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={relu_8}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf102', layout=FixedLayout('cuda', torch.float32, size=[1, 256, 66, 66], stride=[1115136, 4356, 66, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf101, constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i3, torch.int32) - constant(1, torch.int32))) + 64 * constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i2, torch.int32) - constant(1, torch.int32))) + 4096 * i1),
    ranges=[1, 256, 66, 66],
    origins={reflection_pad2d_12}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf104', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf103, i3 + 64 * i2 + 4096 * i1) + load(primals_30, i1),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={convolution_14}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf106, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 256, 1, 1],
            origins={reciprocal_14}
          )
        ),
        size=(1, 256, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_87}
      ),
      size=(1, 256),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_88}
    ),
    size=(256,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_89}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf108', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf94, i3 + 64 * i2 + 4096 * i1) + load(buf104, i3 + 64 * i2 + 4096 * i1) - load(buf107, i1) / index_expr(4096, torch.float32) * reciprocal(sqrt(load(buf106, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={add_35}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf109', layout=FixedLayout('cuda', torch.float32, size=[1, 256, 66, 66], stride=[1115136, 4356, 66, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf108, constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i3, torch.int32) - constant(1, torch.int32))) + 64 * constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i2, torch.int32) - constant(1, torch.int32))) + 4096 * i1),
    ranges=[1, 256, 66, 66],
    origins={reflection_pad2d_13}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf111', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf110, i3 + 64 * i2 + 4096 * i1) + load(primals_32, i1),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={convolution_15}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf113, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 256, 1, 1],
            origins={reciprocal_15}
          )
        ),
        size=(1, 256, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_93}
      ),
      size=(1, 256),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_94}
    ),
    size=(256,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_95}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf115', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf111, i3 + 64 * i2 + 4096 * i1) - load(buf114, i1) / index_expr(4096, torch.float32) * reciprocal(sqrt(load(buf113, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32)),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={relu_9}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf116', layout=FixedLayout('cuda', torch.float32, size=[1, 256, 66, 66], stride=[1115136, 4356, 66, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf115, constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i3, torch.int32) - constant(1, torch.int32))) + 64 * constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i2, torch.int32) - constant(1, torch.int32))) + 4096 * i1),
    ranges=[1, 256, 66, 66],
    origins={reflection_pad2d_14}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf118', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf117, i3 + 64 * i2 + 4096 * i1) + load(primals_34, i1),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={convolution_16}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf120, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 256, 1, 1],
            origins={reciprocal_16}
          )
        ),
        size=(1, 256, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_99}
      ),
      size=(1, 256),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_100}
    ),
    size=(256,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_101}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf122', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf108, i3 + 64 * i2 + 4096 * i1) + load(buf118, i3 + 64 * i2 + 4096 * i1) - load(buf121, i1) / index_expr(4096, torch.float32) * reciprocal(sqrt(load(buf120, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={add_40}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf123', layout=FixedLayout('cuda', torch.float32, size=[1, 256, 66, 66], stride=[1115136, 4356, 66, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf122, constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i3, torch.int32) - constant(1, torch.int32))) + 64 * constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i2, torch.int32) - constant(1, torch.int32))) + 4096 * i1),
    ranges=[1, 256, 66, 66],
    origins={reflection_pad2d_15}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf125', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf124, i3 + 64 * i2 + 4096 * i1) + load(primals_36, i1),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={convolution_17}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf127, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 256, 1, 1],
            origins={reciprocal_17}
          )
        ),
        size=(1, 256, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_105}
      ),
      size=(1, 256),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_106}
    ),
    size=(256,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_107}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf129', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf125, i3 + 64 * i2 + 4096 * i1) - load(buf128, i1) / index_expr(4096, torch.float32) * reciprocal(sqrt(load(buf127, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32)),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={relu_10}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf130', layout=FixedLayout('cuda', torch.float32, size=[1, 256, 66, 66], stride=[1115136, 4356, 66, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf129, constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i3, torch.int32) - constant(1, torch.int32))) + 64 * constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i2, torch.int32) - constant(1, torch.int32))) + 4096 * i1),
    ranges=[1, 256, 66, 66],
    origins={reflection_pad2d_16}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf132', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf131, i3 + 64 * i2 + 4096 * i1) + load(primals_38, i1),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={convolution_18}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf134, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 256, 1, 1],
            origins={reciprocal_18}
          )
        ),
        size=(1, 256, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_111}
      ),
      size=(1, 256),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_112}
    ),
    size=(256,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_113}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf136', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf122, i3 + 64 * i2 + 4096 * i1) + load(buf132, i3 + 64 * i2 + 4096 * i1) - load(buf135, i1) / index_expr(4096, torch.float32) * reciprocal(sqrt(load(buf134, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={add_45}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf137', layout=FixedLayout('cuda', torch.float32, size=[1, 256, 66, 66], stride=[1115136, 4356, 66, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf136, constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i3, torch.int32) - constant(1, torch.int32))) + 64 * constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i2, torch.int32) - constant(1, torch.int32))) + 4096 * i1),
    ranges=[1, 256, 66, 66],
    origins={reflection_pad2d_17}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf139', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf138, i3 + 64 * i2 + 4096 * i1) + load(primals_40, i1),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={convolution_19}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf141, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 256, 1, 1],
            origins={reciprocal_19}
          )
        ),
        size=(1, 256, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_117}
      ),
      size=(1, 256),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_118}
    ),
    size=(256,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_119}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf143', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf139, i3 + 64 * i2 + 4096 * i1) - load(buf142, i1) / index_expr(4096, torch.float32) * reciprocal(sqrt(load(buf141, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32)),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={relu_11}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf144', layout=FixedLayout('cuda', torch.float32, size=[1, 256, 66, 66], stride=[1115136, 4356, 66, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf143, constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i3, torch.int32) - constant(1, torch.int32))) + 64 * constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i2, torch.int32) - constant(1, torch.int32))) + 4096 * i1),
    ranges=[1, 256, 66, 66],
    origins={reflection_pad2d_18}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf146', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf145, i3 + 64 * i2 + 4096 * i1) + load(primals_42, i1),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={convolution_20}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf148, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 256, 1, 1],
            origins={reciprocal_20}
          )
        ),
        size=(1, 256, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_123}
      ),
      size=(1, 256),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_124}
    ),
    size=(256,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_125}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf150', layout=FixedLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf136, i3 + 64 * i2 + 4096 * i1) + load(buf146, i3 + 64 * i2 + 4096 * i1) - load(buf149, i1) / index_expr(4096, torch.float32) * reciprocal(sqrt(load(buf148, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={add_50}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf152', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 128, 128, 128]), stride=[2097152, 16384, 128, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf151, i3 + 128 * i2 + 16384 * i1) + load(primals_44, i1),
    ranges=torch.Size([1, 128, 128, 128]),
    origins={convolution_21}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf156, i1) / index_expr(16384, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 128, 1, 1],
            origins={reciprocal_21}
          )
        ),
        size=(1, 128, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_129}
      ),
      size=(1, 128),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_130}
    ),
    size=(128,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_131}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf159', layout=FixedLayout('cuda', torch.float32, size=torch.Size([1, 128, 128, 128]), stride=[2097152, 16384, 128, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf152, i3 + 128 * i2 + 16384 * i1) - load(buf158, i1) / index_expr(16384, torch.float32) * reciprocal(sqrt(load(buf156, i1) / index_expr(16384, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32)),
    ranges=torch.Size([1, 128, 128, 128]),
    origins={relu_12}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf161', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 64, 256, 256]), stride=[4194304, 65536, 256, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf160, i3 + 256 * i2 + 65536 * i1) + load(primals_46, i1),
    ranges=torch.Size([1, 64, 256, 256]),
    origins={convolution_22}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf165, i1) / index_expr(65536, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 64, 1, 1],
            origins={reciprocal_22}
          )
        ),
        size=(1, 64, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_135}
      ),
      size=(1, 64),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_136}
    ),
    size=(64,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_137}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf168', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 64, 256, 256]), stride=[4194304, 65536, 256, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf161, i3 + 256 * i2 + 65536 * i1) - load(buf167, i1) / index_expr(65536, torch.float32) * reciprocal(sqrt(load(buf165, i1) / index_expr(65536, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32)),
    ranges=torch.Size([1, 64, 256, 256]),
    origins={relu_13}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf169', layout=FixedLayout('cuda', torch.float32, size=[1, 64, 262, 262], stride=[4393216, 68644, 262, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf168, constant(255, torch.int32) - abs(constant(255, torch.int32) - abs(index_expr(i3, torch.int32) - constant(3, torch.int32))) + 256 * constant(255, torch.int32) - abs(constant(255, torch.int32) - abs(index_expr(i2, torch.int32) - constant(3, torch.int32))) + 65536 * i1),
    ranges=[1, 64, 262, 262],
    origins={reflection_pad2d_19}
  ))
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.float32,
    reciprocal(exp(load(buf170, i3 + 256 * i2 + 65536 * i1) + load(primals_48, i1) * constant(-2.0, torch.float32)) + constant(1.0, torch.float32)) * constant(2.0, torch.float32) - constant(1.0, torch.float32),
    ranges=torch.Size([1, 3, 256, 256]),
    origins={sub_23}
  )
)), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf167, i1) / index_expr(65536, torch.float32),
                  ranges=[1, 64, 1, 1],
                  origins={mean_22}
                )
              ),
              size=(1, 64, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_132}
            ),
            size=(1, 64),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_133}
          ),
          size=(64,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_134}
        ),
        size=(1, 64),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_138}
      ),
      size=(1, 64, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_139}
    ),
    size=(1, 64, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_140}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf158, i1) / index_expr(16384, torch.float32),
                  ranges=[1, 128, 1, 1],
                  origins={mean_21}
                )
              ),
              size=(1, 128, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_126}
            ),
            size=(1, 128),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_127}
          ),
          size=(128,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_128}
        ),
        size=(1, 128),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_150}
      ),
      size=(1, 128, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_151}
    ),
    size=(1, 128, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_152}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf149, i1) / index_expr(4096, torch.float32),
                  ranges=[1, 256, 1, 1],
                  origins={mean_20}
                )
              ),
              size=(1, 256, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_120}
            ),
            size=(1, 256),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_121}
          ),
          size=(256,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_122}
        ),
        size=(1, 256),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_162}
      ),
      size=(1, 256, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_163}
    ),
    size=(1, 256, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_164}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf142, i1) / index_expr(4096, torch.float32),
                  ranges=[1, 256, 1, 1],
                  origins={mean_19}
                )
              ),
              size=(1, 256, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_114}
            ),
            size=(1, 256),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_115}
          ),
          size=(256,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_116}
        ),
        size=(1, 256),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_174}
      ),
      size=(1, 256, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_175}
    ),
    size=(1, 256, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_176}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf135, i1) / index_expr(4096, torch.float32),
                  ranges=[1, 256, 1, 1],
                  origins={mean_18}
                )
              ),
              size=(1, 256, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_108}
            ),
            size=(1, 256),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_109}
          ),
          size=(256,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_110}
        ),
        size=(1, 256),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_186}
      ),
      size=(1, 256, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_187}
    ),
    size=(1, 256, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_188}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf128, i1) / index_expr(4096, torch.float32),
                  ranges=[1, 256, 1, 1],
                  origins={mean_17}
                )
              ),
              size=(1, 256, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_102}
            ),
            size=(1, 256),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_103}
          ),
          size=(256,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_104}
        ),
        size=(1, 256),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_198}
      ),
      size=(1, 256, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_199}
    ),
    size=(1, 256, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_200}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf121, i1) / index_expr(4096, torch.float32),
                  ranges=[1, 256, 1, 1],
                  origins={mean_16}
                )
              ),
              size=(1, 256, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_96}
            ),
            size=(1, 256),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_97}
          ),
          size=(256,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_98}
        ),
        size=(1, 256),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_210}
      ),
      size=(1, 256, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_211}
    ),
    size=(1, 256, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_212}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf114, i1) / index_expr(4096, torch.float32),
                  ranges=[1, 256, 1, 1],
                  origins={mean_15}
                )
              ),
              size=(1, 256, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_90}
            ),
            size=(1, 256),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_91}
          ),
          size=(256,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_92}
        ),
        size=(1, 256),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_222}
      ),
      size=(1, 256, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_223}
    ),
    size=(1, 256, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_224}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf107, i1) / index_expr(4096, torch.float32),
                  ranges=[1, 256, 1, 1],
                  origins={mean_14}
                )
              ),
              size=(1, 256, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_84}
            ),
            size=(1, 256),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_85}
          ),
          size=(256,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_86}
        ),
        size=(1, 256),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_234}
      ),
      size=(1, 256, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_235}
    ),
    size=(1, 256, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_236}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf100, i1) / index_expr(4096, torch.float32),
                  ranges=[1, 256, 1, 1],
                  origins={mean_13}
                )
              ),
              size=(1, 256, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_78}
            ),
            size=(1, 256),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_79}
          ),
          size=(256,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_80}
        ),
        size=(1, 256),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_246}
      ),
      size=(1, 256, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_247}
    ),
    size=(1, 256, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_248}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf93, i1) / index_expr(4096, torch.float32),
                  ranges=[1, 256, 1, 1],
                  origins={mean_12}
                )
              ),
              size=(1, 256, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_72}
            ),
            size=(1, 256),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_73}
          ),
          size=(256,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_74}
        ),
        size=(1, 256),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_258}
      ),
      size=(1, 256, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_259}
    ),
    size=(1, 256, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_260}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf86, i1) / index_expr(4096, torch.float32),
                  ranges=[1, 256, 1, 1],
                  origins={mean_11}
                )
              ),
              size=(1, 256, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_66}
            ),
            size=(1, 256),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_67}
          ),
          size=(256,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_68}
        ),
        size=(1, 256),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_270}
      ),
      size=(1, 256, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_271}
    ),
    size=(1, 256, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_272}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf79, i1) / index_expr(4096, torch.float32),
                  ranges=[1, 256, 1, 1],
                  origins={mean_10}
                )
              ),
              size=(1, 256, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_60}
            ),
            size=(1, 256),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_61}
          ),
          size=(256,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_62}
        ),
        size=(1, 256),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_282}
      ),
      size=(1, 256, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_283}
    ),
    size=(1, 256, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_284}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf72, i1) / index_expr(4096, torch.float32),
                  ranges=[1, 256, 1, 1],
                  origins={mean_9}
                )
              ),
              size=(1, 256, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_54}
            ),
            size=(1, 256),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_55}
          ),
          size=(256,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_56}
        ),
        size=(1, 256),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_294}
      ),
      size=(1, 256, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_295}
    ),
    size=(1, 256, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_296}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf65, i1) / index_expr(4096, torch.float32),
                  ranges=[1, 256, 1, 1],
                  origins={mean_8}
                )
              ),
              size=(1, 256, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_48}
            ),
            size=(1, 256),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_49}
          ),
          size=(256,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_50}
        ),
        size=(1, 256),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_306}
      ),
      size=(1, 256, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_307}
    ),
    size=(1, 256, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_308}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf58, i1) / index_expr(4096, torch.float32),
                  ranges=[1, 256, 1, 1],
                  origins={mean_7}
                )
              ),
              size=(1, 256, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_42}
            ),
            size=(1, 256),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_43}
          ),
          size=(256,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_44}
        ),
        size=(1, 256),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_318}
      ),
      size=(1, 256, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_319}
    ),
    size=(1, 256, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_320}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf51, i1) / index_expr(4096, torch.float32),
                  ranges=[1, 256, 1, 1],
                  origins={mean_6}
                )
              ),
              size=(1, 256, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_36}
            ),
            size=(1, 256),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_37}
          ),
          size=(256,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_38}
        ),
        size=(1, 256),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_330}
      ),
      size=(1, 256, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_331}
    ),
    size=(1, 256, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_332}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf44, i1) / index_expr(4096, torch.float32),
                  ranges=[1, 256, 1, 1],
                  origins={mean_5}
                )
              ),
              size=(1, 256, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_30}
            ),
            size=(1, 256),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_31}
          ),
          size=(256,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_32}
        ),
        size=(1, 256),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_342}
      ),
      size=(1, 256, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_343}
    ),
    size=(1, 256, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_344}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf37, i1) / index_expr(4096, torch.float32),
                  ranges=[1, 256, 1, 1],
                  origins={mean_4}
                )
              ),
              size=(1, 256, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_24}
            ),
            size=(1, 256),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_25}
          ),
          size=(256,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_26}
        ),
        size=(1, 256),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_354}
      ),
      size=(1, 256, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_355}
    ),
    size=(1, 256, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_356}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf30, i1) / index_expr(4096, torch.float32),
                  ranges=[1, 256, 1, 1],
                  origins={mean_3}
                )
              ),
              size=(1, 256, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_18}
            ),
            size=(1, 256),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_19}
          ),
          size=(256,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_20}
        ),
        size=(1, 256),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_366}
      ),
      size=(1, 256, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_367}
    ),
    size=(1, 256, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_368}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf23, i1) / index_expr(4096, torch.float32),
                  ranges=[1, 256, 1, 1],
                  origins={mean_2}
                )
              ),
              size=(1, 256, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_12}
            ),
            size=(1, 256),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_13}
          ),
          size=(256,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_14}
        ),
        size=(1, 256),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_378}
      ),
      size=(1, 256, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_379}
    ),
    size=(1, 256, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_380}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf17, i1) / index_expr(16384, torch.float32),
                  ranges=[1, 128, 1, 1],
                  origins={mean_1}
                )
              ),
              size=(1, 128, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_6}
            ),
            size=(1, 128),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_7}
          ),
          size=(128,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_8}
        ),
        size=(1, 128),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_390}
      ),
      size=(1, 128, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_391}
    ),
    size=(1, 128, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_392}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf8, i1) / index_expr(65536, torch.float32),
                  ranges=[1, 64, 1, 1],
                  origins={mean}
                )
              ),
              size=(1, 64, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze}
            ),
            size=(1, 64),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_1}
          ),
          size=(64,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_2}
        ),
        size=(1, 64),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_402}
      ),
      size=(1, 64, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_403}
    ),
    size=(1, 64, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_404}
  )
), 1, 64, 256, 256, 128, 128, 128, 256, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 128, 128, 256, 256]

While executing return [sub_23, primals_1, primals_3, primals_5, primals_7, primals_9, primals_11, primals_13, primals_15, primals_17, primals_19, primals_21, primals_23, primals_25, primals_27, primals_29, primals_31, primals_33, primals_35, primals_37, primals_39, primals_41, primals_43, primals_45, primals_47, reflection_pad2d, convolution, squeeze_5, view_3, convolution_1, squeeze_11, view_9, convolution_2, squeeze_17, view_15, reflection_pad2d_1, convolution_3, squeeze_23, view_21, reflection_pad2d_2, convolution_4, squeeze_29, add_10, reflection_pad2d_3, convolution_5, squeeze_35, view_29, reflection_pad2d_4, convolution_6, squeeze_41, add_15, reflection_pad2d_5, convolution_7, squeeze_47, view_37, reflection_pad2d_6, convolution_8, squeeze_53, add_20, reflection_pad2d_7, convolution_9, squeeze_59, view_45, reflection_pad2d_8, convolution_10, squeeze_65, add_25, reflection_pad2d_9, convolution_11, squeeze_71, view_53, reflection_pad2d_10, convolution_12, squeeze_77, add_30, reflection_pad2d_11, convolution_13, squeeze_83, view_61, reflection_pad2d_12, convolution_14, squeeze_89, add_35, reflection_pad2d_13, convolution_15, squeeze_95, view_69, reflection_pad2d_14, convolution_16, squeeze_101, add_40, reflection_pad2d_15, convolution_17, squeeze_107, view_77, reflection_pad2d_16, convolution_18, squeeze_113, add_45, reflection_pad2d_17, convolution_19, squeeze_119, view_85, reflection_pad2d_18, convolution_20, squeeze_125, add_50, convolution_21, squeeze_131, view_93, convolution_22, squeeze_137, view_99, reflection_pad2d_19, sub_23, unsqueeze_140, unsqueeze_152, unsqueeze_164, unsqueeze_176, unsqueeze_188, unsqueeze_200, unsqueeze_212, unsqueeze_224, unsqueeze_236, unsqueeze_248, unsqueeze_260, unsqueeze_272, unsqueeze_284, unsqueeze_296, unsqueeze_308, unsqueeze_320, unsqueeze_332, unsqueeze_344, unsqueeze_356, unsqueeze_368, unsqueeze_380, unsqueeze_392, unsqueeze_404, sym_size, mul, sym_size_1, sym_size_2, mul_3, sym_size_3, sym_size_4, mul_6, sym_size_5, 
sym_size_6, sym_size_7, sym_size_8, sym_size_9, sym_size_10, sym_size_11, sym_size_12, sym_size_13, sym_size_14, sym_size_15, sym_size_16, sym_size_17, sym_size_18, sym_size_19, sym_size_20, sym_size_21, sym_size_22, sym_size_23, sym_size_24, sym_size_25, sym_size_26, sym_size_27, sym_size_28, sym_size_29, sym_size_30, sym_size_31, sym_size_32, sym_size_33, sym_size_34, sym_size_35, sym_size_36, sym_size_37, sym_size_38, sym_size_39, sym_size_40, sym_size_41, sym_size_42, sym_size_43, sym_size_44, sym_size_45, sym_size_46]
Original traceback:
None
Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1122, in check_accuracy
    new_result = optimized_model_iter_fn(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1020, in run_n_iterations
    self.model_iter_fn(mod, inputs, collect_outputs=False)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 332, in forward_and_backward_pass
    cloned_inputs = clone_inputs(inputs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 335, in <graph break in forward_and_backward_pass>
    pred = mod(*cloned_inputs)
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/torchbenchmark/torchbenchmark/models/pytorch_CycleGAN_and_pix2pix/models/networks.py", line 370, in forward
    def forward(self, input):
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 954, in forward
    return compiled_f(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 940, in new_func
    compiled_fn = create_aot_dispatcher_function(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 660, in create_aot_dispatcher_function
    aot_dispatch_autograd(flat_fn, fake_flat_tensor_args, aot_config)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 516, in aot_dispatch_autograd
    compiled_fw_func = aot_config.fw_compiler(fw_module, deduped_flat_args)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 87, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 351, in fw_compiler
    return compile_fx_inner(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/debug_utils.py", line 444, in debug_wrapper
    compiled_fn = compiler_fn(gm, example_inputs, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/debug.py", line 177, in inner
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/env/lib/python3.9/contextlib.py", line 79, in inner
    return func(*args, **kwds)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 121, in compile_fx_inner
    graph.run(*example_inputs)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 87, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 129, in run
    return super().run(*args)
  File "/scratch/ezyang/work/pytorch/torch/fx/interpreter.py", line 130, in run
    self.env[node] = self.run_node(node)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 299, in run_node
    result = super().run_node(n)
  File "/scratch/ezyang/work/pytorch/torch/fx/interpreter.py", line 171, in run_node
    return getattr(self, n.op)(n.target, args, kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 267, in output
    assert all(
AssertionError: [TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.float32,
    reciprocal(exp(load(buf170, i3 + 256 * i2 + 65536 * i1) + load(primals_48, i1) * constant(-2.0, torch.float32)) + constant(1.0, torch.float32)) * constant(2.0, torch.float32) - constant(1.0, torch.float32),
    ranges=torch.Size([1, 3, 256, 256]),
    origins={sub_23}
  )
)), TensorBox(StorageBox(
  InputBuffer(name='primals_1', layout=FixedLayout('cuda', torch.float32, size=[64, 3, 7, 7], stride=[147, 49, 7, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_3', layout=FixedLayout('cuda', torch.float32, size=[128, 64, 3, 3], stride=[576, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_5', layout=FixedLayout('cuda', torch.float32, size=[256, 128, 3, 3], stride=[1152, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_7', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_9', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_11', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_13', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_15', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_17', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_19', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_21', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_23', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_25', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_27', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_29', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_31', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_33', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_35', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_37', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_39', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_41', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_43', layout=FixedLayout('cuda', torch.float32, size=[256, 128, 3, 3], stride=[1152, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_45', layout=FixedLayout('cuda', torch.float32, size=[128, 64, 3, 3], stride=[576, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_47', layout=FixedLayout('cuda', torch.float32, size=[3, 64, 7, 7], stride=[3136, 49, 7, 1]))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf0', layout=FixedLayout('cuda', torch.float32, size=[1, 3, 262, 262], stride=[205932, 68644, 262, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_49, constant(255, torch.int32) - abs(constant(255, torch.int32) - abs(index_expr(i3, torch.int32) - constant(3, torch.int32))) + constant(255, torch.int32) - abs(constant(255, torch.int32) - abs(index_expr(i2, torch.int32) - constant(3, torch.int32))) * s1 + i1 * s1**2),
    ranges=[1, 3, 262, 262],
    origins={reflection_pad2d}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf2', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 64, 256, 256]), stride=[4194304, 65536, 256, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf1, i3 + 256 * i2 + 65536 * i1) + load(primals_2, i1),
    ranges=torch.Size([1, 64, 256, 256]),
    origins={convolution}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf6, i1) / index_expr(65536, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 64, 1, 1],
            origins={reciprocal}
          )
        ),
        size=(1, 64, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_3}
      ),
      size=(1, 64),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_4}
    ),
    size=(64,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_5}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf9', layout=FixedLayout('cuda', torch.float32, size=torch.Size([1, 64, 256, 256]), stride=[4194304, 65536, 256, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf2, i3 + 256 * i2 + 65536 * i1) - load(buf8, i1) / index_expr(65536, torch.float32) * reciprocal(sqrt(load(buf6, i1) / index_expr(65536, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32)),
    ranges=torch.Size([1, 64, 256, 256]),
    origins={relu}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf11', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 128, 128, 128]), stride=[2097152, 16384, 128, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf10, i3 + 128 * i2 + 16384 * i1) + load(primals_4, i1),
    ranges=torch.Size([1, 128, 128, 128]),
    origins={convolution_1}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf15, i1) / index_expr(16384, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 128, 1, 1],
            origins={reciprocal_1}
          )
        ),
        size=(1, 128, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_9}
      ),
      size=(1, 128),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_10}
    ),
    size=(128,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_11}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf18', layout=FixedLayout('cuda', torch.float32, size=torch.Size([1, 128, 128, 128]), stride=[2097152, 16384, 128, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf11, i3 + 128 * i2 + 16384 * i1) - load(buf17, i1) / index_expr(16384, torch.float32) * reciprocal(sqrt(load(buf15, i1) / index_expr(16384, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32)),
    ranges=torch.Size([1, 128, 128, 128]),
    origins={relu_1}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf20', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf19, i3 + 64 * i2 + 4096 * i1) + load(primals_6, i1),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={convolution_2}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf22, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 256, 1, 1],
            origins={reciprocal_2}
          )
        ),
        size=(1, 256, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_15}
      ),
      size=(1, 256),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_16}
    ),
    size=(256,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_17}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf24', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf20, i3 + 64 * i2 + 4096 * i1) - load(buf23, i1) / index_expr(4096, torch.float32) * reciprocal(sqrt(load(buf22, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32)),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={relu_2}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf25', layout=FixedLayout('cuda', torch.float32, size=[1, 256, 66, 66], stride=[1115136, 4356, 66, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf24, constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i3, torch.int32) - constant(1, torch.int32))) + 64 * constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i2, torch.int32) - constant(1, torch.int32))) + 4096 * i1),
    ranges=[1, 256, 66, 66],
    origins={reflection_pad2d_1}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf27', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf26, i3 + 64 * i2 + 4096 * i1) + load(primals_8, i1),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={convolution_3}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf29, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 256, 1, 1],
            origins={reciprocal_3}
          )
        ),
        size=(1, 256, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_21}
      ),
      size=(1, 256),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_22}
    ),
    size=(256,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_23}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf31', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf27, i3 + 64 * i2 + 4096 * i1) - load(buf30, i1) / index_expr(4096, torch.float32) * reciprocal(sqrt(load(buf29, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32)),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={relu_3}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf32', layout=FixedLayout('cuda', torch.float32, size=[1, 256, 66, 66], stride=[1115136, 4356, 66, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf31, constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i3, torch.int32) - constant(1, torch.int32))) + 64 * constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i2, torch.int32) - constant(1, torch.int32))) + 4096 * i1),
    ranges=[1, 256, 66, 66],
    origins={reflection_pad2d_2}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf34', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf33, i3 + 64 * i2 + 4096 * i1) + load(primals_10, i1),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={convolution_4}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf36, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 256, 1, 1],
            origins={reciprocal_4}
          )
        ),
        size=(1, 256, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_27}
      ),
      size=(1, 256),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_28}
    ),
    size=(256,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_29}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf38', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf24, i3 + 64 * i2 + 4096 * i1) + load(buf34, i3 + 64 * i2 + 4096 * i1) - load(buf37, i1) / index_expr(4096, torch.float32) * reciprocal(sqrt(load(buf36, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={add_10}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf39', layout=FixedLayout('cuda', torch.float32, size=[1, 256, 66, 66], stride=[1115136, 4356, 66, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf38, constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i3, torch.int32) - constant(1, torch.int32))) + 64 * constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i2, torch.int32) - constant(1, torch.int32))) + 4096 * i1),
    ranges=[1, 256, 66, 66],
    origins={reflection_pad2d_3}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf41', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf40, i3 + 64 * i2 + 4096 * i1) + load(primals_12, i1),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={convolution_5}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf43, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 256, 1, 1],
            origins={reciprocal_5}
          )
        ),
        size=(1, 256, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_33}
      ),
      size=(1, 256),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_34}
    ),
    size=(256,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_35}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf45', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf41, i3 + 64 * i2 + 4096 * i1) - load(buf44, i1) / index_expr(4096, torch.float32) * reciprocal(sqrt(load(buf43, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32)),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={relu_4}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf46', layout=FixedLayout('cuda', torch.float32, size=[1, 256, 66, 66], stride=[1115136, 4356, 66, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf45, constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i3, torch.int32) - constant(1, torch.int32))) + 64 * constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i2, torch.int32) - constant(1, torch.int32))) + 4096 * i1),
    ranges=[1, 256, 66, 66],
    origins={reflection_pad2d_4}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf48', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf47, i3 + 64 * i2 + 4096 * i1) + load(primals_14, i1),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={convolution_6}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf50, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 256, 1, 1],
            origins={reciprocal_6}
          )
        ),
        size=(1, 256, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_39}
      ),
      size=(1, 256),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_40}
    ),
    size=(256,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_41}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf52', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf38, i3 + 64 * i2 + 4096 * i1) + load(buf48, i3 + 64 * i2 + 4096 * i1) - load(buf51, i1) / index_expr(4096, torch.float32) * reciprocal(sqrt(load(buf50, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={add_15}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf53', layout=FixedLayout('cuda', torch.float32, size=[1, 256, 66, 66], stride=[1115136, 4356, 66, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf52, constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i3, torch.int32) - constant(1, torch.int32))) + 64 * constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i2, torch.int32) - constant(1, torch.int32))) + 4096 * i1),
    ranges=[1, 256, 66, 66],
    origins={reflection_pad2d_5}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf55', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf54, i3 + 64 * i2 + 4096 * i1) + load(primals_16, i1),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={convolution_7}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf57, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 256, 1, 1],
            origins={reciprocal_7}
          )
        ),
        size=(1, 256, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_45}
      ),
      size=(1, 256),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_46}
    ),
    size=(256,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_47}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf59', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf55, i3 + 64 * i2 + 4096 * i1) - load(buf58, i1) / index_expr(4096, torch.float32) * reciprocal(sqrt(load(buf57, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32)),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={relu_5}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf60', layout=FixedLayout('cuda', torch.float32, size=[1, 256, 66, 66], stride=[1115136, 4356, 66, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf59, constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i3, torch.int32) - constant(1, torch.int32))) + 64 * constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i2, torch.int32) - constant(1, torch.int32))) + 4096 * i1),
    ranges=[1, 256, 66, 66],
    origins={reflection_pad2d_6}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf62', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf61, i3 + 64 * i2 + 4096 * i1) + load(primals_18, i1),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={convolution_8}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf64, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 256, 1, 1],
            origins={reciprocal_8}
          )
        ),
        size=(1, 256, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_51}
      ),
      size=(1, 256),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_52}
    ),
    size=(256,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_53}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf66', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf52, i3 + 64 * i2 + 4096 * i1) + load(buf62, i3 + 64 * i2 + 4096 * i1) - load(buf65, i1) / index_expr(4096, torch.float32) * reciprocal(sqrt(load(buf64, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={add_20}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf67', layout=FixedLayout('cuda', torch.float32, size=[1, 256, 66, 66], stride=[1115136, 4356, 66, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf66, constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i3, torch.int32) - constant(1, torch.int32))) + 64 * constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i2, torch.int32) - constant(1, torch.int32))) + 4096 * i1),
    ranges=[1, 256, 66, 66],
    origins={reflection_pad2d_7}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf69', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf68, i3 + 64 * i2 + 4096 * i1) + load(primals_20, i1),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={convolution_9}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf71, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 256, 1, 1],
            origins={reciprocal_9}
          )
        ),
        size=(1, 256, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_57}
      ),
      size=(1, 256),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_58}
    ),
    size=(256,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_59}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf73', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf69, i3 + 64 * i2 + 4096 * i1) - load(buf72, i1) / index_expr(4096, torch.float32) * reciprocal(sqrt(load(buf71, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32)),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={relu_6}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf74', layout=FixedLayout('cuda', torch.float32, size=[1, 256, 66, 66], stride=[1115136, 4356, 66, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf73, constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i3, torch.int32) - constant(1, torch.int32))) + 64 * constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i2, torch.int32) - constant(1, torch.int32))) + 4096 * i1),
    ranges=[1, 256, 66, 66],
    origins={reflection_pad2d_8}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf76', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf75, i3 + 64 * i2 + 4096 * i1) + load(primals_22, i1),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={convolution_10}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf78, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 256, 1, 1],
            origins={reciprocal_10}
          )
        ),
        size=(1, 256, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_63}
      ),
      size=(1, 256),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_64}
    ),
    size=(256,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_65}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf80', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf66, i3 + 64 * i2 + 4096 * i1) + load(buf76, i3 + 64 * i2 + 4096 * i1) - load(buf79, i1) / index_expr(4096, torch.float32) * reciprocal(sqrt(load(buf78, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={add_25}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf81', layout=FixedLayout('cuda', torch.float32, size=[1, 256, 66, 66], stride=[1115136, 4356, 66, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf80, constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i3, torch.int32) - constant(1, torch.int32))) + 64 * constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i2, torch.int32) - constant(1, torch.int32))) + 4096 * i1),
    ranges=[1, 256, 66, 66],
    origins={reflection_pad2d_9}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf83', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf82, i3 + 64 * i2 + 4096 * i1) + load(primals_24, i1),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={convolution_11}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf85, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 256, 1, 1],
            origins={reciprocal_11}
          )
        ),
        size=(1, 256, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_69}
      ),
      size=(1, 256),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_70}
    ),
    size=(256,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_71}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf87', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf83, i3 + 64 * i2 + 4096 * i1) - load(buf86, i1) / index_expr(4096, torch.float32) * reciprocal(sqrt(load(buf85, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32)),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={relu_7}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf88', layout=FixedLayout('cuda', torch.float32, size=[1, 256, 66, 66], stride=[1115136, 4356, 66, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf87, constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i3, torch.int32) - constant(1, torch.int32))) + 64 * constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i2, torch.int32) - constant(1, torch.int32))) + 4096 * i1),
    ranges=[1, 256, 66, 66],
    origins={reflection_pad2d_10}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf90', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf89, i3 + 64 * i2 + 4096 * i1) + load(primals_26, i1),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={convolution_12}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf92, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 256, 1, 1],
            origins={reciprocal_12}
          )
        ),
        size=(1, 256, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_75}
      ),
      size=(1, 256),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_76}
    ),
    size=(256,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_77}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf94', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf80, i3 + 64 * i2 + 4096 * i1) + load(buf90, i3 + 64 * i2 + 4096 * i1) - load(buf93, i1) / index_expr(4096, torch.float32) * reciprocal(sqrt(load(buf92, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={add_30}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf95', layout=FixedLayout('cuda', torch.float32, size=[1, 256, 66, 66], stride=[1115136, 4356, 66, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf94, constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i3, torch.int32) - constant(1, torch.int32))) + 64 * constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i2, torch.int32) - constant(1, torch.int32))) + 4096 * i1),
    ranges=[1, 256, 66, 66],
    origins={reflection_pad2d_11}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf97', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf96, i3 + 64 * i2 + 4096 * i1) + load(primals_28, i1),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={convolution_13}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf99, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 256, 1, 1],
            origins={reciprocal_13}
          )
        ),
        size=(1, 256, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_81}
      ),
      size=(1, 256),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_82}
    ),
    size=(256,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_83}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf101', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf97, i3 + 64 * i2 + 4096 * i1) - load(buf100, i1) / index_expr(4096, torch.float32) * reciprocal(sqrt(load(buf99, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32)),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={relu_8}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf102', layout=FixedLayout('cuda', torch.float32, size=[1, 256, 66, 66], stride=[1115136, 4356, 66, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf101, constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i3, torch.int32) - constant(1, torch.int32))) + 64 * constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i2, torch.int32) - constant(1, torch.int32))) + 4096 * i1),
    ranges=[1, 256, 66, 66],
    origins={reflection_pad2d_12}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf104', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf103, i3 + 64 * i2 + 4096 * i1) + load(primals_30, i1),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={convolution_14}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf106, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 256, 1, 1],
            origins={reciprocal_14}
          )
        ),
        size=(1, 256, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_87}
      ),
      size=(1, 256),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_88}
    ),
    size=(256,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_89}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf108', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf94, i3 + 64 * i2 + 4096 * i1) + load(buf104, i3 + 64 * i2 + 4096 * i1) - load(buf107, i1) / index_expr(4096, torch.float32) * reciprocal(sqrt(load(buf106, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={add_35}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf109', layout=FixedLayout('cuda', torch.float32, size=[1, 256, 66, 66], stride=[1115136, 4356, 66, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf108, constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i3, torch.int32) - constant(1, torch.int32))) + 64 * constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i2, torch.int32) - constant(1, torch.int32))) + 4096 * i1),
    ranges=[1, 256, 66, 66],
    origins={reflection_pad2d_13}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf111', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf110, i3 + 64 * i2 + 4096 * i1) + load(primals_32, i1),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={convolution_15}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf113, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 256, 1, 1],
            origins={reciprocal_15}
          )
        ),
        size=(1, 256, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_93}
      ),
      size=(1, 256),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_94}
    ),
    size=(256,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_95}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf115', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf111, i3 + 64 * i2 + 4096 * i1) - load(buf114, i1) / index_expr(4096, torch.float32) * reciprocal(sqrt(load(buf113, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32)),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={relu_9}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf116', layout=FixedLayout('cuda', torch.float32, size=[1, 256, 66, 66], stride=[1115136, 4356, 66, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf115, constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i3, torch.int32) - constant(1, torch.int32))) + 64 * constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i2, torch.int32) - constant(1, torch.int32))) + 4096 * i1),
    ranges=[1, 256, 66, 66],
    origins={reflection_pad2d_14}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf118', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf117, i3 + 64 * i2 + 4096 * i1) + load(primals_34, i1),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={convolution_16}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf120, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 256, 1, 1],
            origins={reciprocal_16}
          )
        ),
        size=(1, 256, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_99}
      ),
      size=(1, 256),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_100}
    ),
    size=(256,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_101}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf122', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf108, i3 + 64 * i2 + 4096 * i1) + load(buf118, i3 + 64 * i2 + 4096 * i1) - load(buf121, i1) / index_expr(4096, torch.float32) * reciprocal(sqrt(load(buf120, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={add_40}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf123', layout=FixedLayout('cuda', torch.float32, size=[1, 256, 66, 66], stride=[1115136, 4356, 66, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf122, constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i3, torch.int32) - constant(1, torch.int32))) + 64 * constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i2, torch.int32) - constant(1, torch.int32))) + 4096 * i1),
    ranges=[1, 256, 66, 66],
    origins={reflection_pad2d_15}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf125', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf124, i3 + 64 * i2 + 4096 * i1) + load(primals_36, i1),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={convolution_17}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf127, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 256, 1, 1],
            origins={reciprocal_17}
          )
        ),
        size=(1, 256, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_105}
      ),
      size=(1, 256),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_106}
    ),
    size=(256,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_107}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf129', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf125, i3 + 64 * i2 + 4096 * i1) - load(buf128, i1) / index_expr(4096, torch.float32) * reciprocal(sqrt(load(buf127, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32)),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={relu_10}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf130', layout=FixedLayout('cuda', torch.float32, size=[1, 256, 66, 66], stride=[1115136, 4356, 66, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf129, constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i3, torch.int32) - constant(1, torch.int32))) + 64 * constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i2, torch.int32) - constant(1, torch.int32))) + 4096 * i1),
    ranges=[1, 256, 66, 66],
    origins={reflection_pad2d_16}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf132', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf131, i3 + 64 * i2 + 4096 * i1) + load(primals_38, i1),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={convolution_18}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf134, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 256, 1, 1],
            origins={reciprocal_18}
          )
        ),
        size=(1, 256, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_111}
      ),
      size=(1, 256),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_112}
    ),
    size=(256,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_113}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf136', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf122, i3 + 64 * i2 + 4096 * i1) + load(buf132, i3 + 64 * i2 + 4096 * i1) - load(buf135, i1) / index_expr(4096, torch.float32) * reciprocal(sqrt(load(buf134, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={add_45}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf137', layout=FixedLayout('cuda', torch.float32, size=[1, 256, 66, 66], stride=[1115136, 4356, 66, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf136, constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i3, torch.int32) - constant(1, torch.int32))) + 64 * constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i2, torch.int32) - constant(1, torch.int32))) + 4096 * i1),
    ranges=[1, 256, 66, 66],
    origins={reflection_pad2d_17}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf139', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf138, i3 + 64 * i2 + 4096 * i1) + load(primals_40, i1),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={convolution_19}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf141, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 256, 1, 1],
            origins={reciprocal_19}
          )
        ),
        size=(1, 256, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_117}
      ),
      size=(1, 256),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_118}
    ),
    size=(256,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_119}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf143', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf139, i3 + 64 * i2 + 4096 * i1) - load(buf142, i1) / index_expr(4096, torch.float32) * reciprocal(sqrt(load(buf141, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32)),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={relu_11}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf144', layout=FixedLayout('cuda', torch.float32, size=[1, 256, 66, 66], stride=[1115136, 4356, 66, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf143, constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i3, torch.int32) - constant(1, torch.int32))) + 64 * constant(63, torch.int32) - abs(constant(63, torch.int32) - abs(index_expr(i2, torch.int32) - constant(1, torch.int32))) + 4096 * i1),
    ranges=[1, 256, 66, 66],
    origins={reflection_pad2d_18}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf146', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf145, i3 + 64 * i2 + 4096 * i1) + load(primals_42, i1),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={convolution_20}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf148, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 256, 1, 1],
            origins={reciprocal_20}
          )
        ),
        size=(1, 256, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_123}
      ),
      size=(1, 256),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_124}
    ),
    size=(256,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_125}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf150', layout=FixedLayout('cuda', torch.float32, size=torch.Size([1, 256, 64, 64]), stride=[1048576, 4096, 64, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf136, i3 + 64 * i2 + 4096 * i1) + load(buf146, i3 + 64 * i2 + 4096 * i1) - load(buf149, i1) / index_expr(4096, torch.float32) * reciprocal(sqrt(load(buf148, i1) / index_expr(4096, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32),
    ranges=torch.Size([1, 256, 64, 64]),
    origins={add_50}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf152', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 128, 128, 128]), stride=[2097152, 16384, 128, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf151, i3 + 128 * i2 + 16384 * i1) + load(primals_44, i1),
    ranges=torch.Size([1, 128, 128, 128]),
    origins={convolution_21}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf156, i1) / index_expr(16384, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 128, 1, 1],
            origins={reciprocal_21}
          )
        ),
        size=(1, 128, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_129}
      ),
      size=(1, 128),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_130}
    ),
    size=(128,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_131}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf159', layout=FixedLayout('cuda', torch.float32, size=torch.Size([1, 128, 128, 128]), stride=[2097152, 16384, 128, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf152, i3 + 128 * i2 + 16384 * i1) - load(buf158, i1) / index_expr(16384, torch.float32) * reciprocal(sqrt(load(buf156, i1) / index_expr(16384, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32)),
    ranges=torch.Size([1, 128, 128, 128]),
    origins={relu_12}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf161', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 64, 256, 256]), stride=[4194304, 65536, 256, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf160, i3 + 256 * i2 + 65536 * i1) + load(primals_46, i1),
    ranges=torch.Size([1, 64, 256, 256]),
    origins={convolution_22}
  ))
)), TensorBox(
  View(
    View(
      View(
        StorageBox(
          Pointwise(
            'cuda',
            torch.float32,
            reciprocal(sqrt(load(buf165, i1) / index_expr(65536, torch.float32) + constant(1e-05, torch.float32))),
            ranges=[1, 64, 1, 1],
            origins={reciprocal_22}
          )
        ),
        size=(1, 64, 1),
        reindex=lambda i0, i1, i2: [0, i1, 0, 0],
        origins={squeeze_135}
      ),
      size=(1, 64),
      reindex=lambda i0, i1: [0, i1, 0],
      origins={squeeze_136}
    ),
    size=(64,),
    reindex=lambda i0: [0, i0],
    origins={squeeze_137}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf168', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([1, 64, 256, 256]), stride=[4194304, 65536, 256, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf161, i3 + 256 * i2 + 65536 * i1) - load(buf167, i1) / index_expr(65536, torch.float32) * reciprocal(sqrt(load(buf165, i1) / index_expr(65536, torch.float32) + constant(1e-05, torch.float32))) * constant(1, torch.float32) + constant(0, torch.float32)),
    ranges=torch.Size([1, 64, 256, 256]),
    origins={relu_13}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf169', layout=FixedLayout('cuda', torch.float32, size=[1, 64, 262, 262], stride=[4393216, 68644, 262, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf168, constant(255, torch.int32) - abs(constant(255, torch.int32) - abs(index_expr(i3, torch.int32) - constant(3, torch.int32))) + 256 * constant(255, torch.int32) - abs(constant(255, torch.int32) - abs(index_expr(i2, torch.int32) - constant(3, torch.int32))) + 65536 * i1),
    ranges=[1, 64, 262, 262],
    origins={reflection_pad2d_19}
  ))
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.float32,
    reciprocal(exp(load(buf170, i3 + 256 * i2 + 65536 * i1) + load(primals_48, i1) * constant(-2.0, torch.float32)) + constant(1.0, torch.float32)) * constant(2.0, torch.float32) - constant(1.0, torch.float32),
    ranges=torch.Size([1, 3, 256, 256]),
    origins={sub_23}
  )
)), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf167, i1) / index_expr(65536, torch.float32),
                  ranges=[1, 64, 1, 1],
                  origins={mean_22}
                )
              ),
              size=(1, 64, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_132}
            ),
            size=(1, 64),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_133}
          ),
          size=(64,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_134}
        ),
        size=(1, 64),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_138}
      ),
      size=(1, 64, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_139}
    ),
    size=(1, 64, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_140}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf158, i1) / index_expr(16384, torch.float32),
                  ranges=[1, 128, 1, 1],
                  origins={mean_21}
                )
              ),
              size=(1, 128, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_126}
            ),
            size=(1, 128),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_127}
          ),
          size=(128,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_128}
        ),
        size=(1, 128),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_150}
      ),
      size=(1, 128, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_151}
    ),
    size=(1, 128, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_152}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf149, i1) / index_expr(4096, torch.float32),
                  ranges=[1, 256, 1, 1],
                  origins={mean_20}
                )
              ),
              size=(1, 256, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_120}
            ),
            size=(1, 256),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_121}
          ),
          size=(256,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_122}
        ),
        size=(1, 256),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_162}
      ),
      size=(1, 256, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_163}
    ),
    size=(1, 256, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_164}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf142, i1) / index_expr(4096, torch.float32),
                  ranges=[1, 256, 1, 1],
                  origins={mean_19}
                )
              ),
              size=(1, 256, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_114}
            ),
            size=(1, 256),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_115}
          ),
          size=(256,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_116}
        ),
        size=(1, 256),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_174}
      ),
      size=(1, 256, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_175}
    ),
    size=(1, 256, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_176}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf135, i1) / index_expr(4096, torch.float32),
                  ranges=[1, 256, 1, 1],
                  origins={mean_18}
                )
              ),
              size=(1, 256, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_108}
            ),
            size=(1, 256),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_109}
          ),
          size=(256,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_110}
        ),
        size=(1, 256),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_186}
      ),
      size=(1, 256, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_187}
    ),
    size=(1, 256, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_188}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf128, i1) / index_expr(4096, torch.float32),
                  ranges=[1, 256, 1, 1],
                  origins={mean_17}
                )
              ),
              size=(1, 256, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_102}
            ),
            size=(1, 256),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_103}
          ),
          size=(256,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_104}
        ),
        size=(1, 256),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_198}
      ),
      size=(1, 256, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_199}
    ),
    size=(1, 256, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_200}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf121, i1) / index_expr(4096, torch.float32),
                  ranges=[1, 256, 1, 1],
                  origins={mean_16}
                )
              ),
              size=(1, 256, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_96}
            ),
            size=(1, 256),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_97}
          ),
          size=(256,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_98}
        ),
        size=(1, 256),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_210}
      ),
      size=(1, 256, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_211}
    ),
    size=(1, 256, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_212}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf114, i1) / index_expr(4096, torch.float32),
                  ranges=[1, 256, 1, 1],
                  origins={mean_15}
                )
              ),
              size=(1, 256, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_90}
            ),
            size=(1, 256),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_91}
          ),
          size=(256,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_92}
        ),
        size=(1, 256),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_222}
      ),
      size=(1, 256, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_223}
    ),
    size=(1, 256, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_224}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf107, i1) / index_expr(4096, torch.float32),
                  ranges=[1, 256, 1, 1],
                  origins={mean_14}
                )
              ),
              size=(1, 256, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_84}
            ),
            size=(1, 256),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_85}
          ),
          size=(256,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_86}
        ),
        size=(1, 256),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_234}
      ),
      size=(1, 256, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_235}
    ),
    size=(1, 256, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_236}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf100, i1) / index_expr(4096, torch.float32),
                  ranges=[1, 256, 1, 1],
                  origins={mean_13}
                )
              ),
              size=(1, 256, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_78}
            ),
            size=(1, 256),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_79}
          ),
          size=(256,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_80}
        ),
        size=(1, 256),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_246}
      ),
      size=(1, 256, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_247}
    ),
    size=(1, 256, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_248}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf93, i1) / index_expr(4096, torch.float32),
                  ranges=[1, 256, 1, 1],
                  origins={mean_12}
                )
              ),
              size=(1, 256, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_72}
            ),
            size=(1, 256),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_73}
          ),
          size=(256,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_74}
        ),
        size=(1, 256),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_258}
      ),
      size=(1, 256, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_259}
    ),
    size=(1, 256, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_260}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf86, i1) / index_expr(4096, torch.float32),
                  ranges=[1, 256, 1, 1],
                  origins={mean_11}
                )
              ),
              size=(1, 256, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_66}
            ),
            size=(1, 256),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_67}
          ),
          size=(256,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_68}
        ),
        size=(1, 256),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_270}
      ),
      size=(1, 256, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_271}
    ),
    size=(1, 256, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_272}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf79, i1) / index_expr(4096, torch.float32),
                  ranges=[1, 256, 1, 1],
                  origins={mean_10}
                )
              ),
              size=(1, 256, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_60}
            ),
            size=(1, 256),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_61}
          ),
          size=(256,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_62}
        ),
        size=(1, 256),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_282}
      ),
      size=(1, 256, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_283}
    ),
    size=(1, 256, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_284}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf72, i1) / index_expr(4096, torch.float32),
                  ranges=[1, 256, 1, 1],
                  origins={mean_9}
                )
              ),
              size=(1, 256, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_54}
            ),
            size=(1, 256),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_55}
          ),
          size=(256,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_56}
        ),
        size=(1, 256),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_294}
      ),
      size=(1, 256, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_295}
    ),
    size=(1, 256, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_296}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf65, i1) / index_expr(4096, torch.float32),
                  ranges=[1, 256, 1, 1],
                  origins={mean_8}
                )
              ),
              size=(1, 256, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_48}
            ),
            size=(1, 256),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_49}
          ),
          size=(256,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_50}
        ),
        size=(1, 256),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_306}
      ),
      size=(1, 256, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_307}
    ),
    size=(1, 256, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_308}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf58, i1) / index_expr(4096, torch.float32),
                  ranges=[1, 256, 1, 1],
                  origins={mean_7}
                )
              ),
              size=(1, 256, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_42}
            ),
            size=(1, 256),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_43}
          ),
          size=(256,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_44}
        ),
        size=(1, 256),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_318}
      ),
      size=(1, 256, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_319}
    ),
    size=(1, 256, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_320}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf51, i1) / index_expr(4096, torch.float32),
                  ranges=[1, 256, 1, 1],
                  origins={mean_6}
                )
              ),
              size=(1, 256, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_36}
            ),
            size=(1, 256),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_37}
          ),
          size=(256,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_38}
        ),
        size=(1, 256),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_330}
      ),
      size=(1, 256, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_331}
    ),
    size=(1, 256, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_332}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf44, i1) / index_expr(4096, torch.float32),
                  ranges=[1, 256, 1, 1],
                  origins={mean_5}
                )
              ),
              size=(1, 256, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_30}
            ),
            size=(1, 256),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_31}
          ),
          size=(256,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_32}
        ),
        size=(1, 256),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_342}
      ),
      size=(1, 256, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_343}
    ),
    size=(1, 256, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_344}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf37, i1) / index_expr(4096, torch.float32),
                  ranges=[1, 256, 1, 1],
                  origins={mean_4}
                )
              ),
              size=(1, 256, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_24}
            ),
            size=(1, 256),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_25}
          ),
          size=(256,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_26}
        ),
        size=(1, 256),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_354}
      ),
      size=(1, 256, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_355}
    ),
    size=(1, 256, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_356}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf30, i1) / index_expr(4096, torch.float32),
                  ranges=[1, 256, 1, 1],
                  origins={mean_3}
                )
              ),
              size=(1, 256, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_18}
            ),
            size=(1, 256),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_19}
          ),
          size=(256,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_20}
        ),
        size=(1, 256),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_366}
      ),
      size=(1, 256, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_367}
    ),
    size=(1, 256, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_368}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf23, i1) / index_expr(4096, torch.float32),
                  ranges=[1, 256, 1, 1],
                  origins={mean_2}
                )
              ),
              size=(1, 256, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_12}
            ),
            size=(1, 256),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_13}
          ),
          size=(256,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_14}
        ),
        size=(1, 256),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_378}
      ),
      size=(1, 256, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_379}
    ),
    size=(1, 256, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_380}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf17, i1) / index_expr(16384, torch.float32),
                  ranges=[1, 128, 1, 1],
                  origins={mean_1}
                )
              ),
              size=(1, 128, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze_6}
            ),
            size=(1, 128),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_7}
          ),
          size=(128,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_8}
        ),
        size=(1, 128),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_390}
      ),
      size=(1, 128, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_391}
    ),
    size=(1, 128, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_392}
  )
), TensorBox(
  View(
    View(
      View(
        View(
          View(
            View(
              StorageBox(
                Pointwise(
                  'cuda',
                  torch.float32,
                  load(buf8, i1) / index_expr(65536, torch.float32),
                  ranges=[1, 64, 1, 1],
                  origins={mean}
                )
              ),
              size=(1, 64, 1),
              reindex=lambda i0, i1, i2: [0, i1, 0, 0],
              origins={squeeze}
            ),
            size=(1, 64),
            reindex=lambda i0, i1: [0, i1, 0],
            origins={squeeze_1}
          ),
          size=(64,),
          reindex=lambda i0: [0, i0],
          origins={squeeze_2}
        ),
        size=(1, 64),
        reindex=lambda i0, i1: [i1],
        origins={unsqueeze_402}
      ),
      size=(1, 64, 1),
      reindex=lambda i0, i1, i2: [0, i1],
      origins={unsqueeze_403}
    ),
    size=(1, 64, 1, 1),
    reindex=lambda i0, i1, i2, i3: [0, i1, 0],
    origins={unsqueeze_404}
  )
), 1, 64, 256, 256, 128, 128, 128, 256, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 128, 128, 256, 256]

While executing return [sub_23, primals_1, primals_3, primals_5, primals_7, primals_9, primals_11, primals_13, primals_15, primals_17, primals_19, primals_21, primals_23, primals_25, primals_27, primals_29, primals_31, primals_33, primals_35, primals_37, primals_39, primals_41, primals_43, primals_45, primals_47, reflection_pad2d, convolution, squeeze_5, view_3, convolution_1, squeeze_11, view_9, convolution_2, squeeze_17, view_15, reflection_pad2d_1, convolution_3, squeeze_23, view_21, reflection_pad2d_2, convolution_4, squeeze_29, add_10, reflection_pad2d_3, convolution_5, squeeze_35, view_29, reflection_pad2d_4, convolution_6, squeeze_41, add_15, reflection_pad2d_5, convolution_7, squeeze_47, view_37, reflection_pad2d_6, convolution_8, squeeze_53, add_20, reflection_pad2d_7, convolution_9, squeeze_59, view_45, reflection_pad2d_8, convolution_10, squeeze_65, add_25, reflection_pad2d_9, convolution_11, squeeze_71, view_53, reflection_pad2d_10, convolution_12, squeeze_77, add_30, reflection_pad2d_11, convolution_13, squeeze_83, view_61, reflection_pad2d_12, convolution_14, squeeze_89, add_35, reflection_pad2d_13, convolution_15, squeeze_95, view_69, reflection_pad2d_14, convolution_16, squeeze_101, add_40, reflection_pad2d_15, convolution_17, squeeze_107, view_77, reflection_pad2d_16, convolution_18, squeeze_113, add_45, reflection_pad2d_17, convolution_19, squeeze_119, view_85, reflection_pad2d_18, convolution_20, squeeze_125, add_50, convolution_21, squeeze_131, view_93, convolution_22, squeeze_137, view_99, reflection_pad2d_19, sub_23, unsqueeze_140, unsqueeze_152, unsqueeze_164, unsqueeze_176, unsqueeze_188, unsqueeze_200, unsqueeze_212, unsqueeze_224, unsqueeze_236, unsqueeze_248, unsqueeze_260, unsqueeze_272, unsqueeze_284, unsqueeze_296, unsqueeze_308, unsqueeze_320, unsqueeze_332, unsqueeze_344, unsqueeze_356, unsqueeze_368, unsqueeze_380, unsqueeze_392, unsqueeze_404, sym_size, mul, sym_size_1, sym_size_2, mul_3, sym_size_3, sym_size_4, mul_6, sym_size_5, 
sym_size_6, sym_size_7, sym_size_8, sym_size_9, sym_size_10, sym_size_11, sym_size_12, sym_size_13, sym_size_14, sym_size_15, sym_size_16, sym_size_17, sym_size_18, sym_size_19, sym_size_20, sym_size_21, sym_size_22, sym_size_23, sym_size_24, sym_size_25, sym_size_26, sym_size_27, sym_size_28, sym_size_29, sym_size_30, sym_size_31, sym_size_32, sym_size_33, sym_size_34, sym_size_35, sym_size_36, sym_size_37, sym_size_38, sym_size_39, sym_size_40, sym_size_41, sym_size_42, sym_size_43, sym_size_44, sym_size_45, sym_size_46]
Original traceback:
None
--dataroot /scratch/ezyang/work/torchbenchmark/torchbenchmark/data/.data/pytorch_CycleGAN_and_pix2pix_inputs/datasets/horse2zebra --name horse2zebra --model cycle_gan --display_id 0 --n_epochs 3 --n_epochs_decay 3 --gpu_ids 0 --checkpoints_dir /scratch/ezyang/work/torchbenchmark/torchbenchmark/models/pytorch_CycleGAN_and_pix2pix/.data/checkpoints
TorchDynamo optimized model failed to run because of following error
cuda train pytorch_CycleGAN_and_pix2pix       FAIL
Running torchbench.py pytorch_stargan...
ERROR:common:[TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.float32,
    reciprocal(exp(load(buf88, i3 + 128 * i2 + 16384 * i1 + 49152 * i0) * constant(-2.0, torch.float32)) + constant(1.0, torch.float32)) * constant(2.0, torch.float32) - constant(1.0, torch.float32),
    ranges=torch.Size([16, 3, 128, 128]),
    origins={sub_17}
  )
)), TensorBox(StorageBox(
  InputBuffer(name='primals_1', layout=FixedLayout('cuda', torch.float32, size=[64, 8, 7, 7], stride=[392, 49, 7, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_4', layout=FixedLayout('cuda', torch.float32, size=[128, 64, 4, 4], stride=[1024, 16, 4, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_7', layout=FixedLayout('cuda', torch.float32, size=[256, 128, 4, 4], stride=[2048, 16, 4, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_10', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_13', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_16', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_19', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_22', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_25', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_28', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_31', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_34', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_37', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_40', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_43', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_46', layout=FixedLayout('cuda', torch.float32, size=[256, 128, 4, 4], stride=[2048, 16, 4, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_49', layout=FixedLayout('cuda', torch.float32, size=[128, 64, 4, 4], stride=[1024, 16, 4, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_52', layout=FixedLayout('cuda', torch.float32, size=[3, 64, 7, 7], stride=[3136, 49, 7, 1]))
)), TensorBox(StorageBox(
  ConcatKernel(name='buf2', layout=FixedLayout('cuda', torch.float32, size=[s0, 8, s2, s2], stride=[8*s2**2, s2**2, s2, 1]), inputs=[ComputedBuffer(name='buf0', layout=AliasedLayout('cuda', torch.float32, size=[s0, s1, s2, s2], stride=[8*s2**2, s2**2, s2, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_104, i3 + i1 * s2**2 + i2 * s2 + i0 * s1 * s2**2),
    ranges=[s0, s1, s2, s2],
    origins={cat}
  )), ComputedBuffer(name='buf1', layout=AliasedLayout('cuda', torch.float32, size=[s0, 8 - s1, s2, s2], stride=[8*s2**2, s2**2, s2, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_105, i1 + i0 * 8 + -1 * s1),
    ranges=[s0, 8 - s1, s2, s2],
    origins={cat}
  ))])
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf4', layout=FixedLayout('cuda', torch.float32, size=[64*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_2, ModularIndexing(i0, 1, 64)),
    ranges=[64*s0],
    origins={repeat_1}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf5', layout=FixedLayout('cuda', torch.float32, size=[64*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_53, ModularIndexing(i0, 1, 64)),
    ranges=[64*s0],
    origins={repeat_3}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf6', layout=FlexibleLayout('cuda', torch.float32, size=[64*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_54, ModularIndexing(i0, 1, 64)),
    ranges=[64*s0],
    origins={repeat_4}
  ))
)), TensorBox(
  View(
    StorageBox(
      Convolution(
        name=buf3,
        layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([16, 64, 128, 128]), stride=[1048576, 16384, 128, 1]),
        inputs=[ConcatKernel(name='buf2', layout=FixedLayout('cuda', torch.float32, size=[s0, 8, s2, s2], stride=[8*s2**2, s2**2, s2, 1]), inputs=[ComputedBuffer(name='buf0', layout=AliasedLayout('cuda', torch.float32, size=[s0, s1, s2, s2], stride=[8*s2**2, s2**2, s2, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(primals_104, i3 + i1 * s2**2 + i2 * s2 + i0 * s1 * s2**2),
          ranges=[s0, s1, s2, s2],
          origins={cat}
        )), ComputedBuffer(name='buf1', layout=AliasedLayout('cuda', torch.float32, size=[s0, 8 - s1, s2, s2], stride=[8*s2**2, s2**2, s2, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(primals_105, i1 + i0 * 8 + -1 * s1),
          ranges=[s0, 8 - s1, s2, s2],
          origins={cat}
        ))]), InputBuffer(name='primals_1', layout=FixedLayout('cuda', torch.float32, size=[64, 8, 7, 7], stride=[392, 49, 7, 1]))],
        constant_args=(None, (1, 1), (3, 3), (1, 1), False, (0, 0), 1),
        kwargs={},
        output_view=None,
        origins={convolution}
      )
    ),
    size=(1, 1024, 128, 128),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 64, 16), ModularIndexing(i1, 1, 64), i2, i3],
    origins={view_1}
  )
), TensorBox(
  View(
    StorageBox(
      ComputedBuffer(name='buf7', layout=FixedLayout('cuda', torch.float32, size=(16, 64, 128, 128), stride=[1048576, 16384, 128, 1]), data=Pointwise(
        'cuda',
        torch.float32,
        relu(load(buf3, i3 + 128 * i2 + 16384 * ModularIndexing(i1, 1, 64) + 1048576 * ModularIndexing(i1 + 64 * i0, 64, 16)) - load(buf5, i1 + 64 * i0) * reciprocal(sqrt(load(buf6, i1 + 64 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf4, i1 + 64 * i0) + load(primals_3, ModularIndexing(i1, 1, 64))),
        ranges=(16, 64, 128, 128),
        origins={relu}
      ))
    ),
    size=(1, 1024, 128, 128),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 64, 16), ModularIndexing(i1, 1, 64), i2, i3],
    origins={view_5}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf9', layout=FixedLayout('cuda', torch.float32, size=[128*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_5, ModularIndexing(i0, 1, 128)),
    ranges=[128*s0],
    origins={repeat_5}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf10', layout=FixedLayout('cuda', torch.float32, size=[128*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_56, ModularIndexing(i0, 1, 128)),
    ranges=[128*s0],
    origins={repeat_7}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf11', layout=FlexibleLayout('cuda', torch.float32, size=[128*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_57, ModularIndexing(i0, 1, 128)),
    ranges=[128*s0],
    origins={repeat_8}
  ))
)), TensorBox(
  View(
    StorageBox(
      Convolution(
        name=buf8,
        layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([16, 128, 64, 64]), stride=[524288, 4096, 64, 1]),
        inputs=[ReinterpretView(
          View(
            StorageBox(
              ComputedBuffer(name='buf7', layout=FixedLayout('cuda', torch.float32, size=(16, 64, 128, 128), stride=[1048576, 16384, 128, 1]), data=Pointwise(
                'cuda',
                torch.float32,
                relu(load(buf3, i3 + 128 * i2 + 16384 * ModularIndexing(i1, 1, 64) + 1048576 * ModularIndexing(i1 + 64 * i0, 64, 16)) - load(buf5, i1 + 64 * i0) * reciprocal(sqrt(load(buf6, i1 + 64 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf4, i1 + 64 * i0) + load(primals_3, ModularIndexing(i1, 1, 64))),
                ranges=(16, 64, 128, 128),
                origins={relu}
              ))
            ),
            size=(1, 1024, 128, 128),
            reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 64, 16), ModularIndexing(i1, 1, 64), i2, i3],
            origins={view_5}
          ),
          FixedLayout('cuda', torch.float32, size=(16, 64, 128, 128), stride=[1048576, 16384, 128, 1]),
          origins={convolution_1}
        ), InputBuffer(name='primals_4', layout=FixedLayout('cuda', torch.float32, size=[128, 64, 4, 4], stride=[1024, 16, 4, 1]))],
        constant_args=(None, (2, 2), (1, 1), (1, 1), False, (0, 0), 1),
        kwargs={},
        output_view=None,
        origins={convolution_1}
      )
    ),
    size=(1, 2048, 64, 64),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 128, 16), ModularIndexing(i1, 1, 128), i2, i3],
    origins={view_9}
  )
), TensorBox(
  View(
    StorageBox(
      ComputedBuffer(name='buf12', layout=FixedLayout('cuda', torch.float32, size=(16, 128, 64, 64), stride=[524288, 4096, 64, 1]), data=Pointwise(
        'cuda',
        torch.float32,
        relu(load(buf8, i3 + 64 * i2 + 4096 * ModularIndexing(i1, 1, 128) + 524288 * ModularIndexing(i1 + 128 * i0, 128, 16)) - load(buf10, i1 + 128 * i0) * reciprocal(sqrt(load(buf11, i1 + 128 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf9, i1 + 128 * i0) + load(primals_6, ModularIndexing(i1, 1, 128))),
        ranges=(16, 128, 64, 64),
        origins={relu_1}
      ))
    ),
    size=(1, 2048, 64, 64),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 128, 16), ModularIndexing(i1, 1, 128), i2, i3],
    origins={view_13}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf14', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_8, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_9}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf15', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_59, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_11}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf16', layout=FlexibleLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_60, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_12}
  ))
)), TensorBox(
  View(
    StorageBox(
      Convolution(
        name=buf13,
        layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([16, 256, 32, 32]), stride=[262144, 1024, 32, 1]),
        inputs=[ReinterpretView(
          View(
            StorageBox(
              ComputedBuffer(name='buf12', layout=FixedLayout('cuda', torch.float32, size=(16, 128, 64, 64), stride=[524288, 4096, 64, 1]), data=Pointwise(
                'cuda',
                torch.float32,
                relu(load(buf8, i3 + 64 * i2 + 4096 * ModularIndexing(i1, 1, 128) + 524288 * ModularIndexing(i1 + 128 * i0, 128, 16)) - load(buf10, i1 + 128 * i0) * reciprocal(sqrt(load(buf11, i1 + 128 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf9, i1 + 128 * i0) + load(primals_6, ModularIndexing(i1, 1, 128))),
                ranges=(16, 128, 64, 64),
                origins={relu_1}
              ))
            ),
            size=(1, 2048, 64, 64),
            reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 128, 16), ModularIndexing(i1, 1, 128), i2, i3],
            origins={view_13}
          ),
          FixedLayout('cuda', torch.float32, size=(16, 128, 64, 64), stride=[524288, 4096, 64, 1]),
          origins={convolution_2}
        ), InputBuffer(name='primals_7', layout=FixedLayout('cuda', torch.float32, size=[256, 128, 4, 4], stride=[2048, 16, 4, 1]))],
        constant_args=(None, (2, 2), (1, 1), (1, 1), False, (0, 0), 1),
        kwargs={},
        output_view=None,
        origins={convolution_2}
      )
    ),
    size=(1, 4096, 32, 32),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
    origins={view_17}
  )
), TensorBox(
  View(
    StorageBox(
      ComputedBuffer(name='buf17', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
        'cuda',
        torch.float32,
        relu(load(buf13, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf15, i1 + 256 * i0) * reciprocal(sqrt(load(buf16, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf14, i1 + 256 * i0) + load(primals_9, ModularIndexing(i1, 1, 256))),
        ranges=(16, 256, 32, 32),
        origins={relu_2}
      ))
    ),
    size=(1, 4096, 32, 32),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
    origins={view_21}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf19', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_11, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_13}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf20', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_62, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_15}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf21', layout=FlexibleLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_63, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_16}
  ))
)), TensorBox(
  View(
    StorageBox(
      Convolution(
        name=buf18,
        layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([16, 256, 32, 32]), stride=[262144, 1024, 32, 1]),
        inputs=[ReinterpretView(
          View(
            StorageBox(
              ComputedBuffer(name='buf17', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
                'cuda',
                torch.float32,
                relu(load(buf13, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf15, i1 + 256 * i0) * reciprocal(sqrt(load(buf16, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf14, i1 + 256 * i0) + load(primals_9, ModularIndexing(i1, 1, 256))),
                ranges=(16, 256, 32, 32),
                origins={relu_2}
              ))
            ),
            size=(1, 4096, 32, 32),
            reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
            origins={view_21}
          ),
          FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]),
          origins={convolution_3}
        ), InputBuffer(name='primals_10', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))],
        constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 1),
        kwargs={},
        output_view=None,
        origins={convolution_3}
      )
    ),
    size=(1, 4096, 32, 32),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
    origins={view_25}
  )
), TensorBox(
  View(
    StorageBox(
      ComputedBuffer(name='buf22', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
        'cuda',
        torch.float32,
        relu(load(buf18, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf20, i1 + 256 * i0) * reciprocal(sqrt(load(buf21, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf19, i1 + 256 * i0) + load(primals_12, ModularIndexing(i1, 1, 256))),
        ranges=(16, 256, 32, 32),
        origins={relu_3}
      ))
    ),
    size=(1, 4096, 32, 32),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
    origins={view_29}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf24', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_14, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_17}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf25', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_65, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_19}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf26', layout=FlexibleLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_66, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_20}
  ))
)), TensorBox(
  View(
    StorageBox(
      Convolution(
        name=buf23,
        layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([16, 256, 32, 32]), stride=[262144, 1024, 32, 1]),
        inputs=[ReinterpretView(
          View(
            StorageBox(
              ComputedBuffer(name='buf22', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
                'cuda',
                torch.float32,
                relu(load(buf18, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf20, i1 + 256 * i0) * reciprocal(sqrt(load(buf21, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf19, i1 + 256 * i0) + load(primals_12, ModularIndexing(i1, 1, 256))),
                ranges=(16, 256, 32, 32),
                origins={relu_3}
              ))
            ),
            size=(1, 4096, 32, 32),
            reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
            origins={view_29}
          ),
          FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]),
          origins={convolution_4}
        ), InputBuffer(name='primals_13', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))],
        constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 1),
        kwargs={},
        output_view=None,
        origins={convolution_4}
      )
    ),
    size=(1, 4096, 32, 32),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
    origins={view_33}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf27', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf17, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) + load(buf23, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf25, i1 + 256 * i0) * reciprocal(sqrt(load(buf26, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf24, i1 + 256 * i0) + load(primals_15, ModularIndexing(i1, 1, 256)),
    ranges=(16, 256, 32, 32),
    origins={add_10}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf29', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_17, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_21}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf30', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_68, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_23}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf31', layout=FlexibleLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_69, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_24}
  ))
)), TensorBox(
  View(
    StorageBox(
      Convolution(
        name=buf28,
        layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([16, 256, 32, 32]), stride=[262144, 1024, 32, 1]),
        inputs=[ComputedBuffer(name='buf27', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf17, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) + load(buf23, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf25, i1 + 256 * i0) * reciprocal(sqrt(load(buf26, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf24, i1 + 256 * i0) + load(primals_15, ModularIndexing(i1, 1, 256)),
          ranges=(16, 256, 32, 32),
          origins={add_10}
        )), InputBuffer(name='primals_16', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))],
        constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 1),
        kwargs={},
        output_view=None,
        origins={convolution_5}
      )
    ),
    size=(1, 4096, 32, 32),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
    origins={view_37}
  )
), TensorBox(
  View(
    StorageBox(
      ComputedBuffer(name='buf32', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
        'cuda',
        torch.float32,
        relu(load(buf28, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf30, i1 + 256 * i0) * reciprocal(sqrt(load(buf31, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf29, i1 + 256 * i0) + load(primals_18, ModularIndexing(i1, 1, 256))),
        ranges=(16, 256, 32, 32),
        origins={relu_4}
      ))
    ),
    size=(1, 4096, 32, 32),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
    origins={view_41}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf34', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_20, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_25}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf35', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_71, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_27}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf36', layout=FlexibleLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_72, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_28}
  ))
)), TensorBox(
  View(
    StorageBox(
      Convolution(
        name=buf33,
        layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([16, 256, 32, 32]), stride=[262144, 1024, 32, 1]),
        inputs=[ReinterpretView(
          View(
            StorageBox(
              ComputedBuffer(name='buf32', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
                'cuda',
                torch.float32,
                relu(load(buf28, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf30, i1 + 256 * i0) * reciprocal(sqrt(load(buf31, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf29, i1 + 256 * i0) + load(primals_18, ModularIndexing(i1, 1, 256))),
                ranges=(16, 256, 32, 32),
                origins={relu_4}
              ))
            ),
            size=(1, 4096, 32, 32),
            reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
            origins={view_41}
          ),
          FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]),
          origins={convolution_6}
        ), InputBuffer(name='primals_19', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))],
        constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 1),
        kwargs={},
        output_view=None,
        origins={convolution_6}
      )
    ),
    size=(1, 4096, 32, 32),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
    origins={view_45}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf37', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf27, i3 + 32 * i2 + 1024 * i1 + 262144 * i0) + load(buf33, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf35, i1 + 256 * i0) * reciprocal(sqrt(load(buf36, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf34, i1 + 256 * i0) + load(primals_21, ModularIndexing(i1, 1, 256)),
    ranges=(16, 256, 32, 32),
    origins={add_15}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf39', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_23, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_29}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf40', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_74, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_31}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf41', layout=FlexibleLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_75, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_32}
  ))
)), TensorBox(
  View(
    StorageBox(
      Convolution(
        name=buf38,
        layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([16, 256, 32, 32]), stride=[262144, 1024, 32, 1]),
        inputs=[ComputedBuffer(name='buf37', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf27, i3 + 32 * i2 + 1024 * i1 + 262144 * i0) + load(buf33, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf35, i1 + 256 * i0) * reciprocal(sqrt(load(buf36, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf34, i1 + 256 * i0) + load(primals_21, ModularIndexing(i1, 1, 256)),
          ranges=(16, 256, 32, 32),
          origins={add_15}
        )), InputBuffer(name='primals_22', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))],
        constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 1),
        kwargs={},
        output_view=None,
        origins={convolution_7}
      )
    ),
    size=(1, 4096, 32, 32),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
    origins={view_49}
  )
), TensorBox(
  View(
    StorageBox(
      ComputedBuffer(name='buf42', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
        'cuda',
        torch.float32,
        relu(load(buf38, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf40, i1 + 256 * i0) * reciprocal(sqrt(load(buf41, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf39, i1 + 256 * i0) + load(primals_24, ModularIndexing(i1, 1, 256))),
        ranges=(16, 256, 32, 32),
        origins={relu_5}
      ))
    ),
    size=(1, 4096, 32, 32),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
    origins={view_53}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf44', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_26, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_33}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf45', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_77, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_35}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf46', layout=FlexibleLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_78, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_36}
  ))
)), TensorBox(
  View(
    StorageBox(
      Convolution(
        name=buf43,
        layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([16, 256, 32, 32]), stride=[262144, 1024, 32, 1]),
        inputs=[ReinterpretView(
          View(
            StorageBox(
              ComputedBuffer(name='buf42', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
                'cuda',
                torch.float32,
                relu(load(buf38, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf40, i1 + 256 * i0) * reciprocal(sqrt(load(buf41, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf39, i1 + 256 * i0) + load(primals_24, ModularIndexing(i1, 1, 256))),
                ranges=(16, 256, 32, 32),
                origins={relu_5}
              ))
            ),
            size=(1, 4096, 32, 32),
            reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
            origins={view_53}
          ),
          FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]),
          origins={convolution_8}
        ), InputBuffer(name='primals_25', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))],
        constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 1),
        kwargs={},
        output_view=None,
        origins={convolution_8}
      )
    ),
    size=(1, 4096, 32, 32),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
    origins={view_57}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf47', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf37, i3 + 32 * i2 + 1024 * i1 + 262144 * i0) + load(buf43, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf45, i1 + 256 * i0) * reciprocal(sqrt(load(buf46, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf44, i1 + 256 * i0) + load(primals_27, ModularIndexing(i1, 1, 256)),
    ranges=(16, 256, 32, 32),
    origins={add_20}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf49', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_29, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_37}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf50', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_80, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_39}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf51', layout=FlexibleLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_81, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_40}
  ))
)), TensorBox(
  View(
    StorageBox(
      Convolution(
        name=buf48,
        layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([16, 256, 32, 32]), stride=[262144, 1024, 32, 1]),
        inputs=[ComputedBuffer(name='buf47', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf37, i3 + 32 * i2 + 1024 * i1 + 262144 * i0) + load(buf43, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf45, i1 + 256 * i0) * reciprocal(sqrt(load(buf46, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf44, i1 + 256 * i0) + load(primals_27, ModularIndexing(i1, 1, 256)),
          ranges=(16, 256, 32, 32),
          origins={add_20}
        )), InputBuffer(name='primals_28', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))],
        constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 1),
        kwargs={},
        output_view=None,
        origins={convolution_9}
      )
    ),
    size=(1, 4096, 32, 32),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
    origins={view_61}
  )
), TensorBox(
  View(
    StorageBox(
      ComputedBuffer(name='buf52', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
        'cuda',
        torch.float32,
        relu(load(buf48, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf50, i1 + 256 * i0) * reciprocal(sqrt(load(buf51, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf49, i1 + 256 * i0) + load(primals_30, ModularIndexing(i1, 1, 256))),
        ranges=(16, 256, 32, 32),
        origins={relu_6}
      ))
    ),
    size=(1, 4096, 32, 32),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
    origins={view_65}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf54', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_32, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_41}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf55', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_83, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_43}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf56', layout=FlexibleLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_84, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_44}
  ))
)), TensorBox(
  View(
    StorageBox(
      Convolution(
        name=buf53,
        layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([16, 256, 32, 32]), stride=[262144, 1024, 32, 1]),
        inputs=[ReinterpretView(
          View(
            StorageBox(
              ComputedBuffer(name='buf52', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
                'cuda',
                torch.float32,
                relu(load(buf48, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf50, i1 + 256 * i0) * reciprocal(sqrt(load(buf51, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf49, i1 + 256 * i0) + load(primals_30, ModularIndexing(i1, 1, 256))),
                ranges=(16, 256, 32, 32),
                origins={relu_6}
              ))
            ),
            size=(1, 4096, 32, 32),
            reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
            origins={view_65}
          ),
          FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]),
          origins={convolution_10}
        ), InputBuffer(name='primals_31', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))],
        constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 1),
        kwargs={},
        output_view=None,
        origins={convolution_10}
      )
    ),
    size=(1, 4096, 32, 32),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
    origins={view_69}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf57', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf47, i3 + 32 * i2 + 1024 * i1 + 262144 * i0) + load(buf53, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf55, i1 + 256 * i0) * reciprocal(sqrt(load(buf56, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf54, i1 + 256 * i0) + load(primals_33, ModularIndexing(i1, 1, 256)),
    ranges=(16, 256, 32, 32),
    origins={add_25}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf59', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_35, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_45}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf60', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_86, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_47}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf61', layout=FlexibleLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_87, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_48}
  ))
)), TensorBox(
  View(
    StorageBox(
      Convolution(
        name=buf58,
        layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([16, 256, 32, 32]), stride=[262144, 1024, 32, 1]),
        inputs=[ComputedBuffer(name='buf57', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf47, i3 + 32 * i2 + 1024 * i1 + 262144 * i0) + load(buf53, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf55, i1 + 256 * i0) * reciprocal(sqrt(load(buf56, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf54, i1 + 256 * i0) + load(primals_33, ModularIndexing(i1, 1, 256)),
          ranges=(16, 256, 32, 32),
          origins={add_25}
        )), InputBuffer(name='primals_34', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))],
        constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 1),
        kwargs={},
        output_view=None,
        origins={convolution_11}
      )
    ),
    size=(1, 4096, 32, 32),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
    origins={view_73}
  )
), TensorBox(
  View(
    StorageBox(
      ComputedBuffer(name='buf62', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
        'cuda',
        torch.float32,
        relu(load(buf58, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf60, i1 + 256 * i0) * reciprocal(sqrt(load(buf61, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf59, i1 + 256 * i0) + load(primals_36, ModularIndexing(i1, 1, 256))),
        ranges=(16, 256, 32, 32),
        origins={relu_7}
      ))
    ),
    size=(1, 4096, 32, 32),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
    origins={view_77}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf64', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_38, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_49}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf65', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_89, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_51}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf66', layout=FlexibleLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_90, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_52}
  ))
)), TensorBox(
  View(
    StorageBox(
      Convolution(
        name=buf63,
        layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([16, 256, 32, 32]), stride=[262144, 1024, 32, 1]),
        inputs=[ReinterpretView(
          View(
            StorageBox(
              ComputedBuffer(name='buf62', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
                'cuda',
                torch.float32,
                relu(load(buf58, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf60, i1 + 256 * i0) * reciprocal(sqrt(load(buf61, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf59, i1 + 256 * i0) + load(primals_36, ModularIndexing(i1, 1, 256))),
                ranges=(16, 256, 32, 32),
                origins={relu_7}
              ))
            ),
            size=(1, 4096, 32, 32),
            reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
            origins={view_77}
          ),
          FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]),
          origins={convolution_12}
        ), InputBuffer(name='primals_37', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))],
        constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 1),
        kwargs={},
        output_view=None,
        origins={convolution_12}
      )
    ),
    size=(1, 4096, 32, 32),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
    origins={view_81}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf67', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf57, i3 + 32 * i2 + 1024 * i1 + 262144 * i0) + load(buf63, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf65, i1 + 256 * i0) * reciprocal(sqrt(load(buf66, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf64, i1 + 256 * i0) + load(primals_39, ModularIndexing(i1, 1, 256)),
    ranges=(16, 256, 32, 32),
    origins={add_30}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf69', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_41, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_53}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf70', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_92, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_55}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf71', layout=FlexibleLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_93, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_56}
  ))
)), TensorBox(
  View(
    StorageBox(
      Convolution(
        name=buf68,
        layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([16, 256, 32, 32]), stride=[262144, 1024, 32, 1]),
        inputs=[ComputedBuffer(name='buf67', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf57, i3 + 32 * i2 + 1024 * i1 + 262144 * i0) + load(buf63, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf65, i1 + 256 * i0) * reciprocal(sqrt(load(buf66, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf64, i1 + 256 * i0) + load(primals_39, ModularIndexing(i1, 1, 256)),
          ranges=(16, 256, 32, 32),
          origins={add_30}
        )), InputBuffer(name='primals_40', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))],
        constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 1),
        kwargs={},
        output_view=None,
        origins={convolution_13}
      )
    ),
    size=(1, 4096, 32, 32),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
    origins={view_85}
  )
), TensorBox(
  View(
    StorageBox(
      ComputedBuffer(name='buf72', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
        'cuda',
        torch.float32,
        relu(load(buf68, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf70, i1 + 256 * i0) * reciprocal(sqrt(load(buf71, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf69, i1 + 256 * i0) + load(primals_42, ModularIndexing(i1, 1, 256))),
        ranges=(16, 256, 32, 32),
        origins={relu_8}
      ))
    ),
    size=(1, 4096, 32, 32),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
    origins={view_89}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf74', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_44, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_57}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf75', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_95, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_59}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf76', layout=FlexibleLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_96, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_60}
  ))
)), TensorBox(
  View(
    StorageBox(
      Convolution(
        name=buf73,
        layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([16, 256, 32, 32]), stride=[262144, 1024, 32, 1]),
        inputs=[ReinterpretView(
          View(
            StorageBox(
              ComputedBuffer(name='buf72', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
                'cuda',
                torch.float32,
                relu(load(buf68, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf70, i1 + 256 * i0) * reciprocal(sqrt(load(buf71, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf69, i1 + 256 * i0) + load(primals_42, ModularIndexing(i1, 1, 256))),
                ranges=(16, 256, 32, 32),
                origins={relu_8}
              ))
            ),
            size=(1, 4096, 32, 32),
            reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
            origins={view_89}
          ),
          FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]),
          origins={convolution_14}
        ), InputBuffer(name='primals_43', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))],
        constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 1),
        kwargs={},
        output_view=None,
        origins={convolution_14}
      )
    ),
    size=(1, 4096, 32, 32),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
    origins={view_93}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf77', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf67, i3 + 32 * i2 + 1024 * i1 + 262144 * i0) + load(buf73, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf75, i1 + 256 * i0) * reciprocal(sqrt(load(buf76, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf74, i1 + 256 * i0) + load(primals_45, ModularIndexing(i1, 1, 256)),
    ranges=(16, 256, 32, 32),
    origins={add_35}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf79', layout=FixedLayout('cuda', torch.float32, size=[128*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_47, ModularIndexing(i0, 1, 128)),
    ranges=[128*s0],
    origins={repeat_61}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf80', layout=FixedLayout('cuda', torch.float32, size=[128*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_98, ModularIndexing(i0, 1, 128)),
    ranges=[128*s0],
    origins={repeat_63}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf81', layout=FlexibleLayout('cuda', torch.float32, size=[128*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_99, ModularIndexing(i0, 1, 128)),
    ranges=[128*s0],
    origins={repeat_64}
  ))
)), TensorBox(
  View(
    StorageBox(
      Convolution(
        name=buf78,
        layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([16, 128, 64, 64]), stride=[524288, 4096, 64, 1]),
        inputs=[ComputedBuffer(name='buf77', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf67, i3 + 32 * i2 + 1024 * i1 + 262144 * i0) + load(buf73, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf75, i1 + 256 * i0) * reciprocal(sqrt(load(buf76, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf74, i1 + 256 * i0) + load(primals_45, ModularIndexing(i1, 1, 256)),
          ranges=(16, 256, 32, 32),
          origins={add_35}
        )), InputBuffer(name='primals_46', layout=FixedLayout('cuda', torch.float32, size=[256, 128, 4, 4], stride=[2048, 16, 4, 1]))],
        constant_args=(None, (2, 2), (1, 1), (1, 1), True, (0, 0), 1),
        kwargs={},
        output_view=None,
        origins={convolution_15}
      )
    ),
    size=(1, 2048, 64, 64),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 128, 16), ModularIndexing(i1, 1, 128), i2, i3],
    origins={view_97}
  )
), TensorBox(
  View(
    StorageBox(
      ComputedBuffer(name='buf82', layout=FixedLayout('cuda', torch.float32, size=(16, 128, 64, 64), stride=[524288, 4096, 64, 1]), data=Pointwise(
        'cuda',
        torch.float32,
        relu(load(buf78, i3 + 64 * i2 + 4096 * ModularIndexing(i1, 1, 128) + 524288 * ModularIndexing(i1 + 128 * i0, 128, 16)) - load(buf80, i1 + 128 * i0) * reciprocal(sqrt(load(buf81, i1 + 128 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf79, i1 + 128 * i0) + load(primals_48, ModularIndexing(i1, 1, 128))),
        ranges=(16, 128, 64, 64),
        origins={relu_9}
      ))
    ),
    size=(1, 2048, 64, 64),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 128, 16), ModularIndexing(i1, 1, 128), i2, i3],
    origins={view_101}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf84', layout=FixedLayout('cuda', torch.float32, size=[64*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_50, ModularIndexing(i0, 1, 64)),
    ranges=[64*s0],
    origins={repeat_65}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf85', layout=FixedLayout('cuda', torch.float32, size=[64*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_101, ModularIndexing(i0, 1, 64)),
    ranges=[64*s0],
    origins={repeat_67}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf86', layout=FlexibleLayout('cuda', torch.float32, size=[64*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_102, ModularIndexing(i0, 1, 64)),
    ranges=[64*s0],
    origins={repeat_68}
  ))
)), TensorBox(
  View(
    StorageBox(
      Convolution(
        name=buf83,
        layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([16, 64, 128, 128]), stride=[1048576, 16384, 128, 1]),
        inputs=[ReinterpretView(
          View(
            StorageBox(
              ComputedBuffer(name='buf82', layout=FixedLayout('cuda', torch.float32, size=(16, 128, 64, 64), stride=[524288, 4096, 64, 1]), data=Pointwise(
                'cuda',
                torch.float32,
                relu(load(buf78, i3 + 64 * i2 + 4096 * ModularIndexing(i1, 1, 128) + 524288 * ModularIndexing(i1 + 128 * i0, 128, 16)) - load(buf80, i1 + 128 * i0) * reciprocal(sqrt(load(buf81, i1 + 128 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf79, i1 + 128 * i0) + load(primals_48, ModularIndexing(i1, 1, 128))),
                ranges=(16, 128, 64, 64),
                origins={relu_9}
              ))
            ),
            size=(1, 2048, 64, 64),
            reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 128, 16), ModularIndexing(i1, 1, 128), i2, i3],
            origins={view_101}
          ),
          FixedLayout('cuda', torch.float32, size=(16, 128, 64, 64), stride=[524288, 4096, 64, 1]),
          origins={convolution_16}
        ), InputBuffer(name='primals_49', layout=FixedLayout('cuda', torch.float32, size=[128, 64, 4, 4], stride=[1024, 16, 4, 1]))],
        constant_args=(None, (2, 2), (1, 1), (1, 1), True, (0, 0), 1),
        kwargs={},
        output_view=None,
        origins={convolution_16}
      )
    ),
    size=(1, 1024, 128, 128),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 64, 16), ModularIndexing(i1, 1, 64), i2, i3],
    origins={view_105}
  )
), TensorBox(
  View(
    StorageBox(
      ComputedBuffer(name='buf87', layout=FixedLayout('cuda', torch.float32, size=(16, 64, 128, 128), stride=[1048576, 16384, 128, 1]), data=Pointwise(
        'cuda',
        torch.float32,
        relu(load(buf83, i3 + 128 * i2 + 16384 * ModularIndexing(i1, 1, 64) + 1048576 * ModularIndexing(i1 + 64 * i0, 64, 16)) - load(buf85, i1 + 64 * i0) * reciprocal(sqrt(load(buf86, i1 + 64 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf84, i1 + 64 * i0) + load(primals_51, ModularIndexing(i1, 1, 64))),
        ranges=(16, 64, 128, 128),
        origins={relu_10}
      ))
    ),
    size=(1, 1024, 128, 128),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 64, 16), ModularIndexing(i1, 1, 64), i2, i3],
    origins={view_109}
  )
), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.float32,
    reciprocal(exp(load(buf88, i3 + 128 * i2 + 16384 * i1 + 49152 * i0) * constant(-2.0, torch.float32)) + constant(1.0, torch.float32)) * constant(2.0, torch.float32) - constant(1.0, torch.float32),
    ranges=torch.Size([16, 3, 128, 128]),
    origins={sub_17}
  )
)), s0, 128, 128, 64, 64, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 64, 64, 128, 128]

While executing return [sub_17, primals_1, primals_4, primals_7, primals_10, primals_13, primals_16, primals_19, primals_22, primals_25, primals_28, primals_31, primals_34, primals_37, primals_40, primals_43, primals_46, primals_49, primals_52, cat, repeat_1, repeat_3, repeat_4, view_1, view_5, repeat_5, repeat_7, repeat_8, view_9, view_13, repeat_9, repeat_11, repeat_12, view_17, view_21, repeat_13, repeat_15, repeat_16, view_25, view_29, repeat_17, repeat_19, repeat_20, view_33, add_10, repeat_21, repeat_23, repeat_24, view_37, view_41, repeat_25, repeat_27, repeat_28, view_45, add_15, repeat_29, repeat_31, repeat_32, view_49, view_53, repeat_33, repeat_35, repeat_36, view_57, add_20, repeat_37, repeat_39, repeat_40, view_61, view_65, repeat_41, repeat_43, repeat_44, view_69, add_25, repeat_45, repeat_47, repeat_48, view_73, view_77, repeat_49, repeat_51, repeat_52, view_81, add_30, repeat_53, repeat_55, repeat_56, view_85, view_89, repeat_57, repeat_59, repeat_60, view_93, add_35, repeat_61, repeat_63, repeat_64, view_97, view_101, repeat_65, repeat_67, repeat_68, view_105, view_109, sub_17, sym_size_4, sym_size_5, sym_size_6, sym_size_8, sym_size_9, sym_size_11, sym_size_12, sym_size_14, sym_size_15, sym_size_17, sym_size_18, sym_size_19, sym_size_20, sym_size_22, sym_size_23, sym_size_24, sym_size_25, sym_size_27, sym_size_28, sym_size_29, sym_size_30, sym_size_32, sym_size_33, sym_size_34, sym_size_35, sym_size_37, sym_size_38, sym_size_39, sym_size_40, sym_size_42, sym_size_43, sym_size_44, sym_size_45, sym_size_47, sym_size_48]
Original traceback:
None
Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1122, in check_accuracy
    new_result = optimized_model_iter_fn(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1020, in run_n_iterations
    self.model_iter_fn(mod, inputs, collect_outputs=False)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 332, in forward_and_backward_pass
    cloned_inputs = clone_inputs(inputs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 335, in <graph break in forward_and_backward_pass>
    pred = mod(*cloned_inputs)
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/torchbenchmark/torchbenchmark/models/pytorch_stargan/model.py", line 55, in forward
    def forward(self, x, c):
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 954, in forward
    return compiled_f(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 940, in new_func
    compiled_fn = create_aot_dispatcher_function(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 660, in create_aot_dispatcher_function
    aot_dispatch_autograd(flat_fn, fake_flat_tensor_args, aot_config)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 516, in aot_dispatch_autograd
    compiled_fw_func = aot_config.fw_compiler(fw_module, deduped_flat_args)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 87, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 351, in fw_compiler
    return compile_fx_inner(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/debug_utils.py", line 444, in debug_wrapper
    compiled_fn = compiler_fn(gm, example_inputs, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/debug.py", line 177, in inner
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/env/lib/python3.9/contextlib.py", line 79, in inner
    return func(*args, **kwds)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 121, in compile_fx_inner
    graph.run(*example_inputs)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 87, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 129, in run
    return super().run(*args)
  File "/scratch/ezyang/work/pytorch/torch/fx/interpreter.py", line 130, in run
    self.env[node] = self.run_node(node)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 299, in run_node
    result = super().run_node(n)
  File "/scratch/ezyang/work/pytorch/torch/fx/interpreter.py", line 171, in run_node
    return getattr(self, n.op)(n.target, args, kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 267, in output
    assert all(
AssertionError: [TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.float32,
    reciprocal(exp(load(buf88, i3 + 128 * i2 + 16384 * i1 + 49152 * i0) * constant(-2.0, torch.float32)) + constant(1.0, torch.float32)) * constant(2.0, torch.float32) - constant(1.0, torch.float32),
    ranges=torch.Size([16, 3, 128, 128]),
    origins={sub_17}
  )
)), TensorBox(StorageBox(
  InputBuffer(name='primals_1', layout=FixedLayout('cuda', torch.float32, size=[64, 8, 7, 7], stride=[392, 49, 7, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_4', layout=FixedLayout('cuda', torch.float32, size=[128, 64, 4, 4], stride=[1024, 16, 4, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_7', layout=FixedLayout('cuda', torch.float32, size=[256, 128, 4, 4], stride=[2048, 16, 4, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_10', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_13', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_16', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_19', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_22', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_25', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_28', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_31', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_34', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_37', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_40', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_43', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_46', layout=FixedLayout('cuda', torch.float32, size=[256, 128, 4, 4], stride=[2048, 16, 4, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_49', layout=FixedLayout('cuda', torch.float32, size=[128, 64, 4, 4], stride=[1024, 16, 4, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_52', layout=FixedLayout('cuda', torch.float32, size=[3, 64, 7, 7], stride=[3136, 49, 7, 1]))
)), TensorBox(StorageBox(
  ConcatKernel(name='buf2', layout=FixedLayout('cuda', torch.float32, size=[s0, 8, s2, s2], stride=[8*s2**2, s2**2, s2, 1]), inputs=[ComputedBuffer(name='buf0', layout=AliasedLayout('cuda', torch.float32, size=[s0, s1, s2, s2], stride=[8*s2**2, s2**2, s2, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_104, i3 + i1 * s2**2 + i2 * s2 + i0 * s1 * s2**2),
    ranges=[s0, s1, s2, s2],
    origins={cat}
  )), ComputedBuffer(name='buf1', layout=AliasedLayout('cuda', torch.float32, size=[s0, 8 - s1, s2, s2], stride=[8*s2**2, s2**2, s2, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_105, i1 + i0 * 8 + -1 * s1),
    ranges=[s0, 8 - s1, s2, s2],
    origins={cat}
  ))])
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf4', layout=FixedLayout('cuda', torch.float32, size=[64*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_2, ModularIndexing(i0, 1, 64)),
    ranges=[64*s0],
    origins={repeat_1}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf5', layout=FixedLayout('cuda', torch.float32, size=[64*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_53, ModularIndexing(i0, 1, 64)),
    ranges=[64*s0],
    origins={repeat_3}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf6', layout=FlexibleLayout('cuda', torch.float32, size=[64*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_54, ModularIndexing(i0, 1, 64)),
    ranges=[64*s0],
    origins={repeat_4}
  ))
)), TensorBox(
  View(
    StorageBox(
      Convolution(
        name=buf3,
        layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([16, 64, 128, 128]), stride=[1048576, 16384, 128, 1]),
        inputs=[ConcatKernel(name='buf2', layout=FixedLayout('cuda', torch.float32, size=[s0, 8, s2, s2], stride=[8*s2**2, s2**2, s2, 1]), inputs=[ComputedBuffer(name='buf0', layout=AliasedLayout('cuda', torch.float32, size=[s0, s1, s2, s2], stride=[8*s2**2, s2**2, s2, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(primals_104, i3 + i1 * s2**2 + i2 * s2 + i0 * s1 * s2**2),
          ranges=[s0, s1, s2, s2],
          origins={cat}
        )), ComputedBuffer(name='buf1', layout=AliasedLayout('cuda', torch.float32, size=[s0, 8 - s1, s2, s2], stride=[8*s2**2, s2**2, s2, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(primals_105, i1 + i0 * 8 + -1 * s1),
          ranges=[s0, 8 - s1, s2, s2],
          origins={cat}
        ))]), InputBuffer(name='primals_1', layout=FixedLayout('cuda', torch.float32, size=[64, 8, 7, 7], stride=[392, 49, 7, 1]))],
        constant_args=(None, (1, 1), (3, 3), (1, 1), False, (0, 0), 1),
        kwargs={},
        output_view=None,
        origins={convolution}
      )
    ),
    size=(1, 1024, 128, 128),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 64, 16), ModularIndexing(i1, 1, 64), i2, i3],
    origins={view_1}
  )
), TensorBox(
  View(
    StorageBox(
      ComputedBuffer(name='buf7', layout=FixedLayout('cuda', torch.float32, size=(16, 64, 128, 128), stride=[1048576, 16384, 128, 1]), data=Pointwise(
        'cuda',
        torch.float32,
        relu(load(buf3, i3 + 128 * i2 + 16384 * ModularIndexing(i1, 1, 64) + 1048576 * ModularIndexing(i1 + 64 * i0, 64, 16)) - load(buf5, i1 + 64 * i0) * reciprocal(sqrt(load(buf6, i1 + 64 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf4, i1 + 64 * i0) + load(primals_3, ModularIndexing(i1, 1, 64))),
        ranges=(16, 64, 128, 128),
        origins={relu}
      ))
    ),
    size=(1, 1024, 128, 128),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 64, 16), ModularIndexing(i1, 1, 64), i2, i3],
    origins={view_5}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf9', layout=FixedLayout('cuda', torch.float32, size=[128*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_5, ModularIndexing(i0, 1, 128)),
    ranges=[128*s0],
    origins={repeat_5}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf10', layout=FixedLayout('cuda', torch.float32, size=[128*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_56, ModularIndexing(i0, 1, 128)),
    ranges=[128*s0],
    origins={repeat_7}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf11', layout=FlexibleLayout('cuda', torch.float32, size=[128*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_57, ModularIndexing(i0, 1, 128)),
    ranges=[128*s0],
    origins={repeat_8}
  ))
)), TensorBox(
  View(
    StorageBox(
      Convolution(
        name=buf8,
        layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([16, 128, 64, 64]), stride=[524288, 4096, 64, 1]),
        inputs=[ReinterpretView(
          View(
            StorageBox(
              ComputedBuffer(name='buf7', layout=FixedLayout('cuda', torch.float32, size=(16, 64, 128, 128), stride=[1048576, 16384, 128, 1]), data=Pointwise(
                'cuda',
                torch.float32,
                relu(load(buf3, i3 + 128 * i2 + 16384 * ModularIndexing(i1, 1, 64) + 1048576 * ModularIndexing(i1 + 64 * i0, 64, 16)) - load(buf5, i1 + 64 * i0) * reciprocal(sqrt(load(buf6, i1 + 64 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf4, i1 + 64 * i0) + load(primals_3, ModularIndexing(i1, 1, 64))),
                ranges=(16, 64, 128, 128),
                origins={relu}
              ))
            ),
            size=(1, 1024, 128, 128),
            reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 64, 16), ModularIndexing(i1, 1, 64), i2, i3],
            origins={view_5}
          ),
          FixedLayout('cuda', torch.float32, size=(16, 64, 128, 128), stride=[1048576, 16384, 128, 1]),
          origins={convolution_1}
        ), InputBuffer(name='primals_4', layout=FixedLayout('cuda', torch.float32, size=[128, 64, 4, 4], stride=[1024, 16, 4, 1]))],
        constant_args=(None, (2, 2), (1, 1), (1, 1), False, (0, 0), 1),
        kwargs={},
        output_view=None,
        origins={convolution_1}
      )
    ),
    size=(1, 2048, 64, 64),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 128, 16), ModularIndexing(i1, 1, 128), i2, i3],
    origins={view_9}
  )
), TensorBox(
  View(
    StorageBox(
      ComputedBuffer(name='buf12', layout=FixedLayout('cuda', torch.float32, size=(16, 128, 64, 64), stride=[524288, 4096, 64, 1]), data=Pointwise(
        'cuda',
        torch.float32,
        relu(load(buf8, i3 + 64 * i2 + 4096 * ModularIndexing(i1, 1, 128) + 524288 * ModularIndexing(i1 + 128 * i0, 128, 16)) - load(buf10, i1 + 128 * i0) * reciprocal(sqrt(load(buf11, i1 + 128 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf9, i1 + 128 * i0) + load(primals_6, ModularIndexing(i1, 1, 128))),
        ranges=(16, 128, 64, 64),
        origins={relu_1}
      ))
    ),
    size=(1, 2048, 64, 64),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 128, 16), ModularIndexing(i1, 1, 128), i2, i3],
    origins={view_13}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf14', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_8, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_9}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf15', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_59, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_11}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf16', layout=FlexibleLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_60, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_12}
  ))
)), TensorBox(
  View(
    StorageBox(
      Convolution(
        name=buf13,
        layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([16, 256, 32, 32]), stride=[262144, 1024, 32, 1]),
        inputs=[ReinterpretView(
          View(
            StorageBox(
              ComputedBuffer(name='buf12', layout=FixedLayout('cuda', torch.float32, size=(16, 128, 64, 64), stride=[524288, 4096, 64, 1]), data=Pointwise(
                'cuda',
                torch.float32,
                relu(load(buf8, i3 + 64 * i2 + 4096 * ModularIndexing(i1, 1, 128) + 524288 * ModularIndexing(i1 + 128 * i0, 128, 16)) - load(buf10, i1 + 128 * i0) * reciprocal(sqrt(load(buf11, i1 + 128 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf9, i1 + 128 * i0) + load(primals_6, ModularIndexing(i1, 1, 128))),
                ranges=(16, 128, 64, 64),
                origins={relu_1}
              ))
            ),
            size=(1, 2048, 64, 64),
            reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 128, 16), ModularIndexing(i1, 1, 128), i2, i3],
            origins={view_13}
          ),
          FixedLayout('cuda', torch.float32, size=(16, 128, 64, 64), stride=[524288, 4096, 64, 1]),
          origins={convolution_2}
        ), InputBuffer(name='primals_7', layout=FixedLayout('cuda', torch.float32, size=[256, 128, 4, 4], stride=[2048, 16, 4, 1]))],
        constant_args=(None, (2, 2), (1, 1), (1, 1), False, (0, 0), 1),
        kwargs={},
        output_view=None,
        origins={convolution_2}
      )
    ),
    size=(1, 4096, 32, 32),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
    origins={view_17}
  )
), TensorBox(
  View(
    StorageBox(
      ComputedBuffer(name='buf17', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
        'cuda',
        torch.float32,
        relu(load(buf13, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf15, i1 + 256 * i0) * reciprocal(sqrt(load(buf16, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf14, i1 + 256 * i0) + load(primals_9, ModularIndexing(i1, 1, 256))),
        ranges=(16, 256, 32, 32),
        origins={relu_2}
      ))
    ),
    size=(1, 4096, 32, 32),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
    origins={view_21}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf19', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_11, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_13}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf20', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_62, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_15}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf21', layout=FlexibleLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_63, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_16}
  ))
)), TensorBox(
  View(
    StorageBox(
      Convolution(
        name=buf18,
        layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([16, 256, 32, 32]), stride=[262144, 1024, 32, 1]),
        inputs=[ReinterpretView(
          View(
            StorageBox(
              ComputedBuffer(name='buf17', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
                'cuda',
                torch.float32,
                relu(load(buf13, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf15, i1 + 256 * i0) * reciprocal(sqrt(load(buf16, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf14, i1 + 256 * i0) + load(primals_9, ModularIndexing(i1, 1, 256))),
                ranges=(16, 256, 32, 32),
                origins={relu_2}
              ))
            ),
            size=(1, 4096, 32, 32),
            reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
            origins={view_21}
          ),
          FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]),
          origins={convolution_3}
        ), InputBuffer(name='primals_10', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))],
        constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 1),
        kwargs={},
        output_view=None,
        origins={convolution_3}
      )
    ),
    size=(1, 4096, 32, 32),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
    origins={view_25}
  )
), TensorBox(
  View(
    StorageBox(
      ComputedBuffer(name='buf22', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
        'cuda',
        torch.float32,
        relu(load(buf18, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf20, i1 + 256 * i0) * reciprocal(sqrt(load(buf21, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf19, i1 + 256 * i0) + load(primals_12, ModularIndexing(i1, 1, 256))),
        ranges=(16, 256, 32, 32),
        origins={relu_3}
      ))
    ),
    size=(1, 4096, 32, 32),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
    origins={view_29}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf24', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_14, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_17}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf25', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_65, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_19}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf26', layout=FlexibleLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_66, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_20}
  ))
)), TensorBox(
  View(
    StorageBox(
      Convolution(
        name=buf23,
        layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([16, 256, 32, 32]), stride=[262144, 1024, 32, 1]),
        inputs=[ReinterpretView(
          View(
            StorageBox(
              ComputedBuffer(name='buf22', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
                'cuda',
                torch.float32,
                relu(load(buf18, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf20, i1 + 256 * i0) * reciprocal(sqrt(load(buf21, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf19, i1 + 256 * i0) + load(primals_12, ModularIndexing(i1, 1, 256))),
                ranges=(16, 256, 32, 32),
                origins={relu_3}
              ))
            ),
            size=(1, 4096, 32, 32),
            reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
            origins={view_29}
          ),
          FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]),
          origins={convolution_4}
        ), InputBuffer(name='primals_13', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))],
        constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 1),
        kwargs={},
        output_view=None,
        origins={convolution_4}
      )
    ),
    size=(1, 4096, 32, 32),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
    origins={view_33}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf27', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf17, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) + load(buf23, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf25, i1 + 256 * i0) * reciprocal(sqrt(load(buf26, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf24, i1 + 256 * i0) + load(primals_15, ModularIndexing(i1, 1, 256)),
    ranges=(16, 256, 32, 32),
    origins={add_10}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf29', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_17, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_21}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf30', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_68, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_23}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf31', layout=FlexibleLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_69, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_24}
  ))
)), TensorBox(
  View(
    StorageBox(
      Convolution(
        name=buf28,
        layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([16, 256, 32, 32]), stride=[262144, 1024, 32, 1]),
        inputs=[ComputedBuffer(name='buf27', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf17, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) + load(buf23, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf25, i1 + 256 * i0) * reciprocal(sqrt(load(buf26, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf24, i1 + 256 * i0) + load(primals_15, ModularIndexing(i1, 1, 256)),
          ranges=(16, 256, 32, 32),
          origins={add_10}
        )), InputBuffer(name='primals_16', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))],
        constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 1),
        kwargs={},
        output_view=None,
        origins={convolution_5}
      )
    ),
    size=(1, 4096, 32, 32),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
    origins={view_37}
  )
), TensorBox(
  View(
    StorageBox(
      ComputedBuffer(name='buf32', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
        'cuda',
        torch.float32,
        relu(load(buf28, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf30, i1 + 256 * i0) * reciprocal(sqrt(load(buf31, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf29, i1 + 256 * i0) + load(primals_18, ModularIndexing(i1, 1, 256))),
        ranges=(16, 256, 32, 32),
        origins={relu_4}
      ))
    ),
    size=(1, 4096, 32, 32),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
    origins={view_41}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf34', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_20, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_25}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf35', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_71, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_27}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf36', layout=FlexibleLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_72, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_28}
  ))
)), TensorBox(
  View(
    StorageBox(
      Convolution(
        name=buf33,
        layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([16, 256, 32, 32]), stride=[262144, 1024, 32, 1]),
        inputs=[ReinterpretView(
          View(
            StorageBox(
              ComputedBuffer(name='buf32', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
                'cuda',
                torch.float32,
                relu(load(buf28, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf30, i1 + 256 * i0) * reciprocal(sqrt(load(buf31, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf29, i1 + 256 * i0) + load(primals_18, ModularIndexing(i1, 1, 256))),
                ranges=(16, 256, 32, 32),
                origins={relu_4}
              ))
            ),
            size=(1, 4096, 32, 32),
            reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
            origins={view_41}
          ),
          FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]),
          origins={convolution_6}
        ), InputBuffer(name='primals_19', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))],
        constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 1),
        kwargs={},
        output_view=None,
        origins={convolution_6}
      )
    ),
    size=(1, 4096, 32, 32),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
    origins={view_45}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf37', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf27, i3 + 32 * i2 + 1024 * i1 + 262144 * i0) + load(buf33, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf35, i1 + 256 * i0) * reciprocal(sqrt(load(buf36, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf34, i1 + 256 * i0) + load(primals_21, ModularIndexing(i1, 1, 256)),
    ranges=(16, 256, 32, 32),
    origins={add_15}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf39', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_23, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_29}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf40', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_74, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_31}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf41', layout=FlexibleLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_75, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_32}
  ))
)), TensorBox(
  View(
    StorageBox(
      Convolution(
        name=buf38,
        layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([16, 256, 32, 32]), stride=[262144, 1024, 32, 1]),
        inputs=[ComputedBuffer(name='buf37', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf27, i3 + 32 * i2 + 1024 * i1 + 262144 * i0) + load(buf33, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf35, i1 + 256 * i0) * reciprocal(sqrt(load(buf36, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf34, i1 + 256 * i0) + load(primals_21, ModularIndexing(i1, 1, 256)),
          ranges=(16, 256, 32, 32),
          origins={add_15}
        )), InputBuffer(name='primals_22', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))],
        constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 1),
        kwargs={},
        output_view=None,
        origins={convolution_7}
      )
    ),
    size=(1, 4096, 32, 32),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
    origins={view_49}
  )
), TensorBox(
  View(
    StorageBox(
      ComputedBuffer(name='buf42', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
        'cuda',
        torch.float32,
        relu(load(buf38, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf40, i1 + 256 * i0) * reciprocal(sqrt(load(buf41, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf39, i1 + 256 * i0) + load(primals_24, ModularIndexing(i1, 1, 256))),
        ranges=(16, 256, 32, 32),
        origins={relu_5}
      ))
    ),
    size=(1, 4096, 32, 32),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
    origins={view_53}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf44', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_26, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_33}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf45', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_77, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_35}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf46', layout=FlexibleLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_78, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_36}
  ))
)), TensorBox(
  View(
    StorageBox(
      Convolution(
        name=buf43,
        layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([16, 256, 32, 32]), stride=[262144, 1024, 32, 1]),
        inputs=[ReinterpretView(
          View(
            StorageBox(
              ComputedBuffer(name='buf42', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
                'cuda',
                torch.float32,
                relu(load(buf38, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf40, i1 + 256 * i0) * reciprocal(sqrt(load(buf41, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf39, i1 + 256 * i0) + load(primals_24, ModularIndexing(i1, 1, 256))),
                ranges=(16, 256, 32, 32),
                origins={relu_5}
              ))
            ),
            size=(1, 4096, 32, 32),
            reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
            origins={view_53}
          ),
          FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]),
          origins={convolution_8}
        ), InputBuffer(name='primals_25', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))],
        constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 1),
        kwargs={},
        output_view=None,
        origins={convolution_8}
      )
    ),
    size=(1, 4096, 32, 32),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
    origins={view_57}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf47', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf37, i3 + 32 * i2 + 1024 * i1 + 262144 * i0) + load(buf43, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf45, i1 + 256 * i0) * reciprocal(sqrt(load(buf46, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf44, i1 + 256 * i0) + load(primals_27, ModularIndexing(i1, 1, 256)),
    ranges=(16, 256, 32, 32),
    origins={add_20}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf49', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_29, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_37}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf50', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_80, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_39}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf51', layout=FlexibleLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_81, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_40}
  ))
)), TensorBox(
  View(
    StorageBox(
      Convolution(
        name=buf48,
        layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([16, 256, 32, 32]), stride=[262144, 1024, 32, 1]),
        inputs=[ComputedBuffer(name='buf47', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf37, i3 + 32 * i2 + 1024 * i1 + 262144 * i0) + load(buf43, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf45, i1 + 256 * i0) * reciprocal(sqrt(load(buf46, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf44, i1 + 256 * i0) + load(primals_27, ModularIndexing(i1, 1, 256)),
          ranges=(16, 256, 32, 32),
          origins={add_20}
        )), InputBuffer(name='primals_28', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))],
        constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 1),
        kwargs={},
        output_view=None,
        origins={convolution_9}
      )
    ),
    size=(1, 4096, 32, 32),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
    origins={view_61}
  )
), TensorBox(
  View(
    StorageBox(
      ComputedBuffer(name='buf52', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
        'cuda',
        torch.float32,
        relu(load(buf48, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf50, i1 + 256 * i0) * reciprocal(sqrt(load(buf51, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf49, i1 + 256 * i0) + load(primals_30, ModularIndexing(i1, 1, 256))),
        ranges=(16, 256, 32, 32),
        origins={relu_6}
      ))
    ),
    size=(1, 4096, 32, 32),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
    origins={view_65}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf54', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_32, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_41}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf55', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_83, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_43}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf56', layout=FlexibleLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_84, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_44}
  ))
)), TensorBox(
  View(
    StorageBox(
      Convolution(
        name=buf53,
        layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([16, 256, 32, 32]), stride=[262144, 1024, 32, 1]),
        inputs=[ReinterpretView(
          View(
            StorageBox(
              ComputedBuffer(name='buf52', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
                'cuda',
                torch.float32,
                relu(load(buf48, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf50, i1 + 256 * i0) * reciprocal(sqrt(load(buf51, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf49, i1 + 256 * i0) + load(primals_30, ModularIndexing(i1, 1, 256))),
                ranges=(16, 256, 32, 32),
                origins={relu_6}
              ))
            ),
            size=(1, 4096, 32, 32),
            reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
            origins={view_65}
          ),
          FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]),
          origins={convolution_10}
        ), InputBuffer(name='primals_31', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))],
        constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 1),
        kwargs={},
        output_view=None,
        origins={convolution_10}
      )
    ),
    size=(1, 4096, 32, 32),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
    origins={view_69}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf57', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf47, i3 + 32 * i2 + 1024 * i1 + 262144 * i0) + load(buf53, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf55, i1 + 256 * i0) * reciprocal(sqrt(load(buf56, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf54, i1 + 256 * i0) + load(primals_33, ModularIndexing(i1, 1, 256)),
    ranges=(16, 256, 32, 32),
    origins={add_25}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf59', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_35, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_45}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf60', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_86, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_47}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf61', layout=FlexibleLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_87, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_48}
  ))
)), TensorBox(
  View(
    StorageBox(
      Convolution(
        name=buf58,
        layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([16, 256, 32, 32]), stride=[262144, 1024, 32, 1]),
        inputs=[ComputedBuffer(name='buf57', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf47, i3 + 32 * i2 + 1024 * i1 + 262144 * i0) + load(buf53, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf55, i1 + 256 * i0) * reciprocal(sqrt(load(buf56, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf54, i1 + 256 * i0) + load(primals_33, ModularIndexing(i1, 1, 256)),
          ranges=(16, 256, 32, 32),
          origins={add_25}
        )), InputBuffer(name='primals_34', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))],
        constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 1),
        kwargs={},
        output_view=None,
        origins={convolution_11}
      )
    ),
    size=(1, 4096, 32, 32),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
    origins={view_73}
  )
), TensorBox(
  View(
    StorageBox(
      ComputedBuffer(name='buf62', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
        'cuda',
        torch.float32,
        relu(load(buf58, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf60, i1 + 256 * i0) * reciprocal(sqrt(load(buf61, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf59, i1 + 256 * i0) + load(primals_36, ModularIndexing(i1, 1, 256))),
        ranges=(16, 256, 32, 32),
        origins={relu_7}
      ))
    ),
    size=(1, 4096, 32, 32),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
    origins={view_77}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf64', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_38, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_49}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf65', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_89, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_51}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf66', layout=FlexibleLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_90, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_52}
  ))
)), TensorBox(
  View(
    StorageBox(
      Convolution(
        name=buf63,
        layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([16, 256, 32, 32]), stride=[262144, 1024, 32, 1]),
        inputs=[ReinterpretView(
          View(
            StorageBox(
              ComputedBuffer(name='buf62', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
                'cuda',
                torch.float32,
                relu(load(buf58, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf60, i1 + 256 * i0) * reciprocal(sqrt(load(buf61, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf59, i1 + 256 * i0) + load(primals_36, ModularIndexing(i1, 1, 256))),
                ranges=(16, 256, 32, 32),
                origins={relu_7}
              ))
            ),
            size=(1, 4096, 32, 32),
            reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
            origins={view_77}
          ),
          FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]),
          origins={convolution_12}
        ), InputBuffer(name='primals_37', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))],
        constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 1),
        kwargs={},
        output_view=None,
        origins={convolution_12}
      )
    ),
    size=(1, 4096, 32, 32),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
    origins={view_81}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf67', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf57, i3 + 32 * i2 + 1024 * i1 + 262144 * i0) + load(buf63, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf65, i1 + 256 * i0) * reciprocal(sqrt(load(buf66, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf64, i1 + 256 * i0) + load(primals_39, ModularIndexing(i1, 1, 256)),
    ranges=(16, 256, 32, 32),
    origins={add_30}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf69', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_41, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_53}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf70', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_92, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_55}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf71', layout=FlexibleLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_93, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_56}
  ))
)), TensorBox(
  View(
    StorageBox(
      Convolution(
        name=buf68,
        layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([16, 256, 32, 32]), stride=[262144, 1024, 32, 1]),
        inputs=[ComputedBuffer(name='buf67', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf57, i3 + 32 * i2 + 1024 * i1 + 262144 * i0) + load(buf63, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf65, i1 + 256 * i0) * reciprocal(sqrt(load(buf66, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf64, i1 + 256 * i0) + load(primals_39, ModularIndexing(i1, 1, 256)),
          ranges=(16, 256, 32, 32),
          origins={add_30}
        )), InputBuffer(name='primals_40', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))],
        constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 1),
        kwargs={},
        output_view=None,
        origins={convolution_13}
      )
    ),
    size=(1, 4096, 32, 32),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
    origins={view_85}
  )
), TensorBox(
  View(
    StorageBox(
      ComputedBuffer(name='buf72', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
        'cuda',
        torch.float32,
        relu(load(buf68, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf70, i1 + 256 * i0) * reciprocal(sqrt(load(buf71, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf69, i1 + 256 * i0) + load(primals_42, ModularIndexing(i1, 1, 256))),
        ranges=(16, 256, 32, 32),
        origins={relu_8}
      ))
    ),
    size=(1, 4096, 32, 32),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
    origins={view_89}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf74', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_44, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_57}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf75', layout=FixedLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_95, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_59}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf76', layout=FlexibleLayout('cuda', torch.float32, size=[256*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_96, ModularIndexing(i0, 1, 256)),
    ranges=[256*s0],
    origins={repeat_60}
  ))
)), TensorBox(
  View(
    StorageBox(
      Convolution(
        name=buf73,
        layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([16, 256, 32, 32]), stride=[262144, 1024, 32, 1]),
        inputs=[ReinterpretView(
          View(
            StorageBox(
              ComputedBuffer(name='buf72', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
                'cuda',
                torch.float32,
                relu(load(buf68, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf70, i1 + 256 * i0) * reciprocal(sqrt(load(buf71, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf69, i1 + 256 * i0) + load(primals_42, ModularIndexing(i1, 1, 256))),
                ranges=(16, 256, 32, 32),
                origins={relu_8}
              ))
            ),
            size=(1, 4096, 32, 32),
            reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
            origins={view_89}
          ),
          FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]),
          origins={convolution_14}
        ), InputBuffer(name='primals_43', layout=FixedLayout('cuda', torch.float32, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1]))],
        constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 1),
        kwargs={},
        output_view=None,
        origins={convolution_14}
      )
    ),
    size=(1, 4096, 32, 32),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 256, 16), ModularIndexing(i1, 1, 256), i2, i3],
    origins={view_93}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf77', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf67, i3 + 32 * i2 + 1024 * i1 + 262144 * i0) + load(buf73, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf75, i1 + 256 * i0) * reciprocal(sqrt(load(buf76, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf74, i1 + 256 * i0) + load(primals_45, ModularIndexing(i1, 1, 256)),
    ranges=(16, 256, 32, 32),
    origins={add_35}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf79', layout=FixedLayout('cuda', torch.float32, size=[128*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_47, ModularIndexing(i0, 1, 128)),
    ranges=[128*s0],
    origins={repeat_61}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf80', layout=FixedLayout('cuda', torch.float32, size=[128*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_98, ModularIndexing(i0, 1, 128)),
    ranges=[128*s0],
    origins={repeat_63}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf81', layout=FlexibleLayout('cuda', torch.float32, size=[128*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_99, ModularIndexing(i0, 1, 128)),
    ranges=[128*s0],
    origins={repeat_64}
  ))
)), TensorBox(
  View(
    StorageBox(
      Convolution(
        name=buf78,
        layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([16, 128, 64, 64]), stride=[524288, 4096, 64, 1]),
        inputs=[ComputedBuffer(name='buf77', layout=FixedLayout('cuda', torch.float32, size=(16, 256, 32, 32), stride=[262144, 1024, 32, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf67, i3 + 32 * i2 + 1024 * i1 + 262144 * i0) + load(buf73, i3 + 32 * i2 + 1024 * ModularIndexing(i1, 1, 256) + 262144 * ModularIndexing(i1 + 256 * i0, 256, 16)) - load(buf75, i1 + 256 * i0) * reciprocal(sqrt(load(buf76, i1 + 256 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf74, i1 + 256 * i0) + load(primals_45, ModularIndexing(i1, 1, 256)),
          ranges=(16, 256, 32, 32),
          origins={add_35}
        )), InputBuffer(name='primals_46', layout=FixedLayout('cuda', torch.float32, size=[256, 128, 4, 4], stride=[2048, 16, 4, 1]))],
        constant_args=(None, (2, 2), (1, 1), (1, 1), True, (0, 0), 1),
        kwargs={},
        output_view=None,
        origins={convolution_15}
      )
    ),
    size=(1, 2048, 64, 64),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 128, 16), ModularIndexing(i1, 1, 128), i2, i3],
    origins={view_97}
  )
), TensorBox(
  View(
    StorageBox(
      ComputedBuffer(name='buf82', layout=FixedLayout('cuda', torch.float32, size=(16, 128, 64, 64), stride=[524288, 4096, 64, 1]), data=Pointwise(
        'cuda',
        torch.float32,
        relu(load(buf78, i3 + 64 * i2 + 4096 * ModularIndexing(i1, 1, 128) + 524288 * ModularIndexing(i1 + 128 * i0, 128, 16)) - load(buf80, i1 + 128 * i0) * reciprocal(sqrt(load(buf81, i1 + 128 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf79, i1 + 128 * i0) + load(primals_48, ModularIndexing(i1, 1, 128))),
        ranges=(16, 128, 64, 64),
        origins={relu_9}
      ))
    ),
    size=(1, 2048, 64, 64),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 128, 16), ModularIndexing(i1, 1, 128), i2, i3],
    origins={view_101}
  )
), TensorBox(StorageBox(
  ComputedBuffer(name='buf84', layout=FixedLayout('cuda', torch.float32, size=[64*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_50, ModularIndexing(i0, 1, 64)),
    ranges=[64*s0],
    origins={repeat_65}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf85', layout=FixedLayout('cuda', torch.float32, size=[64*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_101, ModularIndexing(i0, 1, 64)),
    ranges=[64*s0],
    origins={repeat_67}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf86', layout=FlexibleLayout('cuda', torch.float32, size=[64*s0], stride=[1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(primals_102, ModularIndexing(i0, 1, 64)),
    ranges=[64*s0],
    origins={repeat_68}
  ))
)), TensorBox(
  View(
    StorageBox(
      Convolution(
        name=buf83,
        layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([16, 64, 128, 128]), stride=[1048576, 16384, 128, 1]),
        inputs=[ReinterpretView(
          View(
            StorageBox(
              ComputedBuffer(name='buf82', layout=FixedLayout('cuda', torch.float32, size=(16, 128, 64, 64), stride=[524288, 4096, 64, 1]), data=Pointwise(
                'cuda',
                torch.float32,
                relu(load(buf78, i3 + 64 * i2 + 4096 * ModularIndexing(i1, 1, 128) + 524288 * ModularIndexing(i1 + 128 * i0, 128, 16)) - load(buf80, i1 + 128 * i0) * reciprocal(sqrt(load(buf81, i1 + 128 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf79, i1 + 128 * i0) + load(primals_48, ModularIndexing(i1, 1, 128))),
                ranges=(16, 128, 64, 64),
                origins={relu_9}
              ))
            ),
            size=(1, 2048, 64, 64),
            reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 128, 16), ModularIndexing(i1, 1, 128), i2, i3],
            origins={view_101}
          ),
          FixedLayout('cuda', torch.float32, size=(16, 128, 64, 64), stride=[524288, 4096, 64, 1]),
          origins={convolution_16}
        ), InputBuffer(name='primals_49', layout=FixedLayout('cuda', torch.float32, size=[128, 64, 4, 4], stride=[1024, 16, 4, 1]))],
        constant_args=(None, (2, 2), (1, 1), (1, 1), True, (0, 0), 1),
        kwargs={},
        output_view=None,
        origins={convolution_16}
      )
    ),
    size=(1, 1024, 128, 128),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 64, 16), ModularIndexing(i1, 1, 64), i2, i3],
    origins={view_105}
  )
), TensorBox(
  View(
    StorageBox(
      ComputedBuffer(name='buf87', layout=FixedLayout('cuda', torch.float32, size=(16, 64, 128, 128), stride=[1048576, 16384, 128, 1]), data=Pointwise(
        'cuda',
        torch.float32,
        relu(load(buf83, i3 + 128 * i2 + 16384 * ModularIndexing(i1, 1, 64) + 1048576 * ModularIndexing(i1 + 64 * i0, 64, 16)) - load(buf85, i1 + 64 * i0) * reciprocal(sqrt(load(buf86, i1 + 64 * i0) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(buf84, i1 + 64 * i0) + load(primals_51, ModularIndexing(i1, 1, 64))),
        ranges=(16, 64, 128, 128),
        origins={relu_10}
      ))
    ),
    size=(1, 1024, 128, 128),
    reindex=lambda i0, i1, i2, i3: [ModularIndexing(i1, 64, 16), ModularIndexing(i1, 1, 64), i2, i3],
    origins={view_109}
  )
), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.float32,
    reciprocal(exp(load(buf88, i3 + 128 * i2 + 16384 * i1 + 49152 * i0) * constant(-2.0, torch.float32)) + constant(1.0, torch.float32)) * constant(2.0, torch.float32) - constant(1.0, torch.float32),
    ranges=torch.Size([16, 3, 128, 128]),
    origins={sub_17}
  )
)), s0, 128, 128, 64, 64, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 64, 64, 128, 128]

While executing return [sub_17, primals_1, primals_4, primals_7, primals_10, primals_13, primals_16, primals_19, primals_22, primals_25, primals_28, primals_31, primals_34, primals_37, primals_40, primals_43, primals_46, primals_49, primals_52, cat, repeat_1, repeat_3, repeat_4, view_1, view_5, repeat_5, repeat_7, repeat_8, view_9, view_13, repeat_9, repeat_11, repeat_12, view_17, view_21, repeat_13, repeat_15, repeat_16, view_25, view_29, repeat_17, repeat_19, repeat_20, view_33, add_10, repeat_21, repeat_23, repeat_24, view_37, view_41, repeat_25, repeat_27, repeat_28, view_45, add_15, repeat_29, repeat_31, repeat_32, view_49, view_53, repeat_33, repeat_35, repeat_36, view_57, add_20, repeat_37, repeat_39, repeat_40, view_61, view_65, repeat_41, repeat_43, repeat_44, view_69, add_25, repeat_45, repeat_47, repeat_48, view_73, view_77, repeat_49, repeat_51, repeat_52, view_81, add_30, repeat_53, repeat_55, repeat_56, view_85, view_89, repeat_57, repeat_59, repeat_60, view_93, add_35, repeat_61, repeat_63, repeat_64, view_97, view_101, repeat_65, repeat_67, repeat_68, view_105, view_109, sub_17, sym_size_4, sym_size_5, sym_size_6, sym_size_8, sym_size_9, sym_size_11, sym_size_12, sym_size_14, sym_size_15, sym_size_17, sym_size_18, sym_size_19, sym_size_20, sym_size_22, sym_size_23, sym_size_24, sym_size_25, sym_size_27, sym_size_28, sym_size_29, sym_size_30, sym_size_32, sym_size_33, sym_size_34, sym_size_35, sym_size_37, sym_size_38, sym_size_39, sym_size_40, sym_size_42, sym_size_43, sym_size_44, sym_size_45, sym_size_47, sym_size_48]
Original traceback:
None
TorchDynamo optimized model failed to run because of following error
cuda train pytorch_stargan                    FAIL
Running torchbench.py pytorch_struct...
ERROR:common:'int' object has no attribute 'size'

While executing %sym_size : [#users=2] = placeholder[target=sym_size]
Original traceback:
Module stack: {}
  File "/scratch/ezyang/work/torchbenchmark/torchbenchmark/models/pytorch_struct/networks/NeuralCFG.py", line 49, in terms
    torch.einsum("vh,th->tv", self.word_emb, self.mlp1(self.term_emb))
 |   File "/scratch/ezyang/work/torchbenchmark/torchbenchmark/models/pytorch_struct/networks/NeuralCFG.py", line 77, in forward
    return terms(input), rules(batch), roots(batch)
Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1122, in check_accuracy
    new_result = optimized_model_iter_fn(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1020, in run_n_iterations
    self.model_iter_fn(mod, inputs, collect_outputs=False)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 332, in forward_and_backward_pass
    cloned_inputs = clone_inputs(inputs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 337, in <graph break in forward_and_backward_pass>
    self.grad_scaler.scale(loss).backward()
  File "/scratch/ezyang/work/pytorch/torch/_tensor.py", line 450, in backward
    torch.autograd.backward(
  File "/scratch/ezyang/work/pytorch/torch/autograd/__init__.py", line 197, in backward
    Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  File "/scratch/ezyang/work/pytorch/torch/autograd/function.py", line 270, in apply
    return user_fn(self, *args)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 558, in backward
    CompiledFunction.compiled_bw = aot_config.bw_compiler(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/optimizations/backends.py", line 555, in _wrapped_bw_compiler
    return disable(disable(bw_compiler)(*args, **kwargs))
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 87, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 362, in bw_compiler
    return compile_fx_inner(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/debug_utils.py", line 444, in debug_wrapper
    compiled_fn = compiler_fn(gm, example_inputs, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/debug.py", line 177, in inner
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/env/lib/python3.9/contextlib.py", line 79, in inner
    return func(*args, **kwds)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 121, in compile_fx_inner
    graph.run(*example_inputs)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 87, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 129, in run
    return super().run(*args)
  File "/scratch/ezyang/work/pytorch/torch/fx/interpreter.py", line 130, in run
    self.env[node] = self.run_node(node)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 299, in run_node
    result = super().run_node(n)
  File "/scratch/ezyang/work/pytorch/torch/fx/interpreter.py", line 171, in run_node
    return getattr(self, n.op)(n.target, args, kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 199, in placeholder
    sizes, strides = self.static_sizes_strides(example)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 53, in static_sizes_strides
    size = [sympy.Integer(i) for i in ex.size()]
AttributeError: 'int' object has no attribute 'size'

While executing %sym_size : [#users=2] = placeholder[target=sym_size]
Original traceback:
Module stack: {}
  File "/scratch/ezyang/work/torchbenchmark/torchbenchmark/models/pytorch_struct/networks/NeuralCFG.py", line 49, in terms
    torch.einsum("vh,th->tv", self.word_emb, self.mlp1(self.term_emb))
 |   File "/scratch/ezyang/work/torchbenchmark/torchbenchmark/models/pytorch_struct/networks/NeuralCFG.py", line 77, in forward
    return terms(input), rules(batch), roots(batch)

TorchDynamo optimized model failed to run because of following error
cuda train pytorch_struct                     FAIL
Running torchbench.py pytorch_unet...
ERROR:common:Failed running call_function <built-in function pad>(*(FakeTensor(FakeTensor(..., device='meta',
           size=(s0, s7, -4.0*s1 + 2.0*(-2*s1 + (-2*s1 + (-2*s1 + (-2*s1 + s2 + 4)//2 + 5)//2 + 5)//2 + 5)//2 + 14.0, -4.0*s1 + 2.0*(-2*s1 + (-2*s1 + (-2*s1 + (-2*s1 + s3 + 4)//2 + 5)//2 + 5)//2 + 5)//2 + 14.0),
           grad_fn=<UpsampleBilinear2DBackward1>), cuda:0), [(2.0*s1 + (-2*s1 + (-2*s1 + (-2*s1 + s3 + 4)//2 + 5)//2 + 5)//2 - 2.0*(-2*s1 + (-2*s1 + (-2*s1 + (-2*s1 + s3 + 4)//2 + 5)//2 + 5)//2 + 5)//2 - 7.0)//2, 2.0*s1 + (-2*s1 + (-2*s1 + (-2*s1 + s3 + 4)//2 + 5)//2 + 5)//2 - 2.0*(-2*s1 + (-2*s1 + (-2*s1 + (-2*s1 + s3 + 4)//2 + 5)//2 + 5)//2 + 5)//2 - (2.0*s1 + (-2*s1 + (-2*s1 + (-2*s1 + s3 + 4)//2 + 5)//2 + 5)//2 - 2.0*(-2*s1 + (-2*s1 + (-2*s1 + (-2*s1 + s3 + 4)//2 + 5)//2 + 5)//2 + 5)//2 - 7.0)//2 - 7.0, (2.0*s1 + (-2*s1 + (-2*s1 + (-2*s1 + s2 + 4)//2 + 5)//2 + 5)//2 - 2.0*(-2*s1 + (-2*s1 + (-2*s1 + (-2*s1 + s2 + 4)//2 + 5)//2 + 5)//2 + 5)//2 - 7.0)//2, 2.0*s1 + (-2*s1 + (-2*s1 + (-2*s1 + s2 + 4)//2 + 5)//2 + 5)//2 - 2.0*(-2*s1 + (-2*s1 + (-2*s1 + (-2*s1 + s2 + 4)//2 + 5)//2 + 5)//2 + 5)//2 - (2.0*s1 + (-2*s1 + (-2*s1 + (-2*s1 + s2 + 4)//2 + 5)//2 + 5)//2 - 2.0*(-2*s1 + (-2*s1 + (-2*s1 + (-2*s1 + s2 + 4)//2 + 5)//2 + 5)//2 + 5)//2 - 7.0)//2 - 7.0]), **{}):
cannot determine truth value of Relational
(scroll up for backtrace)
Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/variables/tensor.py", line 52, in _run_node
    return node.target(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_subclasses/fake_tensor.py", line 849, in __torch_dispatch__
    return decomposition_table[func](*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_refs/__init__.py", line 2561, in constant_pad_nd
    if pad[pad_idx] < 0:
  File "/scratch/ezyang/work/pytorch/torch/__init__.py", line 214, in __bool__
    return self.node.bool_()
  File "/scratch/ezyang/work/pytorch/torch/fx/experimental/symbolic_shapes.py", line 203, in bool_
    return bool(self.shape_env.evaluate_expr(self.shape_env.replace(self.expr)))
  File "/scratch/ezyang/work/env/lib/python3.9/site-packages/sympy/core/relational.py", line 511, in __bool__
    raise TypeError("cannot determine truth value of Relational")
TypeError: cannot determine truth value of Relational

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1122, in check_accuracy
    new_result = optimized_model_iter_fn(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1020, in run_n_iterations
    self.model_iter_fn(mod, inputs, collect_outputs=False)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 332, in forward_and_backward_pass
    cloned_inputs = clone_inputs(inputs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 335, in <graph break in forward_and_backward_pass>
    pred = mod(*cloned_inputs)
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 251, in catch_errors
    return callback(frame, cache_size)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/convert_frame.py", line 459, in _convert_frame
    result = inner_convert(frame, cache_size)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/convert_frame.py", line 112, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 87, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/convert_frame.py", line 341, in _convert_frame_assert
    return _compile(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/convert_frame.py", line 396, in _compile
    out_code = transform_code_object(code, transform)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/bytecode_transformation.py", line 341, in transform_code_object
    transformations(instructions, code_options)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/convert_frame.py", line 384, in transform
    tracer.run()
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py", line 1494, in run
    super().run()
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py", line 359, in run
    and self.step()
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py", line 329, in step
    getattr(self, inst.opname)(inst)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py", line 181, in wrapper
    return inner_fn(self, inst)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py", line 786, in CALL_FUNCTION
    self.call_function(fn, args, {})
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py", line 271, in call_function
    self.push(fn.call_function(self, args, kwargs))
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/variables/nn_module.py", line 221, in call_function
    return tx.inline_user_function_return(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py", line 300, in inline_user_function_return
    result = InliningInstructionTranslator.inline_call(self, fn, args, kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py", line 1566, in inline_call
    return cls.inline_call_(parent, func, args, kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py", line 1620, in inline_call_
    tracer.run()
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py", line 359, in run
    and self.step()
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py", line 329, in step
    getattr(self, inst.opname)(inst)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py", line 181, in wrapper
    return inner_fn(self, inst)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py", line 786, in CALL_FUNCTION
    self.call_function(fn, args, {})
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/symbolic_convert.py", line 271, in call_function
    self.push(fn.call_function(self, args, kwargs))
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/variables/torch.py", line 408, in call_function
    tensor_variable = TensorVariable.create(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/variables/tensor.py", line 199, in create
    example_value = _get_fake_value(proxy.node, tx)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/variables/tensor.py", line 130, in _get_fake_value
    return wrap_fake_exception(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 709, in wrap_fake_exception
    return fn()
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/variables/tensor.py", line 131, in <lambda>
    lambda: _run_node(tx.output, node, args, kwargs, nnmodule)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/variables/tensor.py", line 61, in _run_node
    raise RuntimeError(
RuntimeError: Failed running call_function <built-in function pad>(*(FakeTensor(FakeTensor(..., device='meta',
           size=(s0, s7, -4.0*s1 + 2.0*(-2*s1 + (-2*s1 + (-2*s1 + (-2*s1 + s2 + 4)//2 + 5)//2 + 5)//2 + 5)//2 + 14.0, -4.0*s1 + 2.0*(-2*s1 + (-2*s1 + (-2*s1 + (-2*s1 + s3 + 4)//2 + 5)//2 + 5)//2 + 5)//2 + 14.0),
           grad_fn=<UpsampleBilinear2DBackward1>), cuda:0), [(2.0*s1 + (-2*s1 + (-2*s1 + (-2*s1 + s3 + 4)//2 + 5)//2 + 5)//2 - 2.0*(-2*s1 + (-2*s1 + (-2*s1 + (-2*s1 + s3 + 4)//2 + 5)//2 + 5)//2 + 5)//2 - 7.0)//2, 2.0*s1 + (-2*s1 + (-2*s1 + (-2*s1 + s3 + 4)//2 + 5)//2 + 5)//2 - 2.0*(-2*s1 + (-2*s1 + (-2*s1 + (-2*s1 + s3 + 4)//2 + 5)//2 + 5)//2 + 5)//2 - (2.0*s1 + (-2*s1 + (-2*s1 + (-2*s1 + s3 + 4)//2 + 5)//2 + 5)//2 - 2.0*(-2*s1 + (-2*s1 + (-2*s1 + (-2*s1 + s3 + 4)//2 + 5)//2 + 5)//2 + 5)//2 - 7.0)//2 - 7.0, (2.0*s1 + (-2*s1 + (-2*s1 + (-2*s1 + s2 + 4)//2 + 5)//2 + 5)//2 - 2.0*(-2*s1 + (-2*s1 + (-2*s1 + (-2*s1 + s2 + 4)//2 + 5)//2 + 5)//2 + 5)//2 - 7.0)//2, 2.0*s1 + (-2*s1 + (-2*s1 + (-2*s1 + s2 + 4)//2 + 5)//2 + 5)//2 - 2.0*(-2*s1 + (-2*s1 + (-2*s1 + (-2*s1 + s2 + 4)//2 + 5)//2 + 5)//2 + 5)//2 - (2.0*s1 + (-2*s1 + (-2*s1 + (-2*s1 + s2 + 4)//2 + 5)//2 + 5)//2 - 2.0*(-2*s1 + (-2*s1 + (-2*s1 + (-2*s1 + s2 + 4)//2 + 5)//2 + 5)//2 + 5)//2 - 7.0)//2 - 7.0]), **{}):
cannot determine truth value of Relational
(scroll up for backtrace)
TorchDynamo optimized model failed to run because of following error
cuda train pytorch_unet                       FAIL
Running torchbench.py resnet18...
ERROR:common:name 's0' is not defined
Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1122, in check_accuracy
    new_result = optimized_model_iter_fn(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1020, in run_n_iterations
    self.model_iter_fn(mod, inputs, collect_outputs=False)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 332, in forward_and_backward_pass
    cloned_inputs = clone_inputs(inputs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 335, in <graph break in forward_and_backward_pass>
    pred = mod(*cloned_inputs)
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/torchvision/torchvision/models/resnet.py", line 284, in forward
    def forward(self, x: Tensor) -> Tensor:
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 954, in forward
    return compiled_f(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 945, in new_func
    return compiled_fn(args)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 321, in g
    return f(*args)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 573, in compiled_function
    return CompiledFunction.apply(*remove_dupe_args(args))
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 528, in forward
    fw_outs = call_func_with_args(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 346, in call_func_with_args
    out = normalize_as_list(f(args))
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 185, in run
    return model(new_inputs)
  File "/tmp/torchinductor_ezyang/ys/cys3jrob4wbdly6mo5wv2bpqu36lalfg3tcid6gae26iq2dogaqe.py", line 791, in call
    return (buf44, primals_1, primals_2, primals_4, primals_5, primals_7, primals_8, primals_10, primals_11, primals_13, primals_14, primals_16, primals_17, primals_19, primals_20, primals_22, primals_23, primals_25, primals_26, primals_28, primals_29, primals_31, primals_32, primals_34, primals_35, primals_37, primals_38, primals_40, primals_41, primals_43, primals_44, primals_46, primals_47, primals_49, primals_50, primals_52, primals_53, primals_55, primals_56, primals_58, primals_59, primals_63, primals_64, primals_66, primals_67, primals_69, primals_70, primals_72, primals_73, primals_75, primals_76, primals_78, primals_79, primals_81, primals_82, primals_84, primals_85, primals_87, primals_88, primals_90, primals_91, primals_93, primals_94, primals_96, primals_97, primals_99, primals_100, primals_102, primals_103, primals_105, primals_106, primals_108, primals_109, primals_111, primals_112, primals_114, primals_115, primals_117, primals_118, primals_120, primals_121, primals_123, buf0, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, buf11, buf12, buf13, buf14, buf15, buf17, buf18, buf19, buf20, buf21, buf22, buf23, buf24, buf25, buf27, buf28, buf29, buf30, buf31, buf32, buf33, buf34, buf35, buf37, buf38, buf39, buf40, as_strided(buf43, (2, 512), (512, 1)), as_strided(primals_61, (1000, 512), (512, 1)), buf45, s0, )
NameError: name 's0' is not defined
TorchDynamo optimized model failed to run because of following error
cuda train resnet18                           FAIL
Running torchbench.py resnet50...
ERROR:common:name 's0' is not defined
Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1122, in check_accuracy
    new_result = optimized_model_iter_fn(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1020, in run_n_iterations
    self.model_iter_fn(mod, inputs, collect_outputs=False)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 332, in forward_and_backward_pass
    cloned_inputs = clone_inputs(inputs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 335, in <graph break in forward_and_backward_pass>
    pred = mod(*cloned_inputs)
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/torchvision/torchvision/models/resnet.py", line 284, in forward
    def forward(self, x: Tensor) -> Tensor:
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 954, in forward
    return compiled_f(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 945, in new_func
    return compiled_fn(args)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 321, in g
    return f(*args)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 573, in compiled_function
    return CompiledFunction.apply(*remove_dupe_args(args))
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 528, in forward
    fw_outs = call_func_with_args(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 346, in call_func_with_args
    out = normalize_as_list(f(args))
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 185, in run
    return model(new_inputs)
  File "/tmp/torchinductor_ezyang/bx/cbxzj3vsscgkrprjh6nf3ai7tocnnolptw6k4qba4hqgiqww7lwc.py", line 1152, in call
    return (buf110, primals_1, primals_2, primals_4, primals_5, primals_7, primals_8, primals_10, primals_11, primals_13, primals_14, primals_16, primals_17, primals_19, primals_20, primals_22, primals_23, primals_25, primals_26, primals_28, primals_29, primals_31, primals_32, primals_34, primals_35, primals_37, primals_38, primals_40, primals_41, primals_43, primals_44, primals_46, primals_47, primals_49, primals_50, primals_52, primals_53, primals_55, primals_56, primals_58, primals_59, primals_61, primals_62, primals_64, primals_65, primals_67, primals_68, primals_70, primals_71, primals_73, primals_74, primals_76, primals_77, primals_79, primals_80, primals_82, primals_83, primals_85, primals_86, primals_88, primals_89, primals_91, primals_92, primals_94, primals_95, primals_97, primals_98, primals_100, primals_101, primals_103, primals_104, primals_106, primals_107, primals_109, primals_110, primals_112, primals_113, primals_115, primals_116, primals_118, primals_119, primals_121, primals_122, primals_124, primals_125, primals_127, primals_128, primals_130, primals_131, primals_133, primals_134, primals_136, primals_137, primals_139, primals_140, primals_142, primals_143, primals_145, primals_146, primals_148, primals_149, primals_151, primals_152, primals_154, primals_155, primals_157, primals_158, primals_162, primals_163, primals_165, primals_166, primals_168, primals_169, primals_171, primals_172, primals_174, primals_175, primals_177, primals_178, primals_180, primals_181, primals_183, primals_184, primals_186, primals_187, primals_189, primals_190, primals_192, primals_193, primals_195, primals_196, primals_198, primals_199, primals_201, primals_202, primals_204, primals_205, primals_207, primals_208, primals_210, primals_211, primals_213, primals_214, primals_216, primals_217, primals_219, primals_220, primals_222, primals_223, primals_225, primals_226, primals_228, primals_229, primals_231, primals_232, primals_234, primals_235, primals_237, 
primals_238, primals_240, primals_241, primals_243, primals_244, primals_246, primals_247, primals_249, primals_250, primals_252, primals_253, primals_255, primals_256, primals_258, primals_259, primals_261, primals_262, primals_264, primals_265, primals_267, primals_268, primals_270, primals_271, primals_273, primals_274, primals_276, primals_277, primals_279, primals_280, primals_282, primals_283, primals_285, primals_286, primals_288, primals_289, primals_291, primals_292, primals_294, primals_295, primals_297, primals_298, primals_300, primals_301, primals_303, primals_304, primals_306, primals_307, primals_309, primals_310, primals_312, primals_313, primals_315, primals_316, primals_318, primals_319, primals_321, buf0, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf11, buf12, buf13, buf14, buf15, buf16, buf17, buf18, buf19, buf20, buf21, buf22, buf23, buf24, buf25, buf26, buf27, buf28, buf29, buf31, buf32, buf33, buf34, buf35, buf36, buf37, buf38, buf39, buf40, buf41, buf42, buf43, buf44, buf45, buf46, buf47, buf48, buf49, buf50, buf51, buf52, buf53, buf54, buf55, buf57, buf58, buf59, buf60, buf61, buf62, buf63, buf64, buf65, buf66, buf67, buf68, buf69, buf70, buf71, buf72, buf73, buf74, buf75, buf76, buf77, buf78, buf79, buf80, buf81, buf82, buf83, buf84, buf85, buf86, buf87, buf88, buf89, buf90, buf91, buf92, buf93, buf95, buf96, buf97, buf98, buf99, buf100, buf101, buf102, buf103, buf104, buf105, buf106, as_strided(buf109, (2, 2048), (2048, 1)), as_strided(primals_160, (1000, 2048), (2048, 1)), buf111, s0, )
NameError: name 's0' is not defined
TorchDynamo optimized model failed to run because of following error
cuda train resnet50                           FAIL
Running torchbench.py resnet50_quantized_qat...
WARNING:common:fp64 golden ref were not generated for resnet50_quantized_qat
[2022-11-06 03:31:17,609] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:17,620] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:17,638] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:17,662] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:17,671] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:17,688] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:17,698] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:17,713] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:17,721] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:17,737] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:17,744] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:17,759] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:17,766] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:17,774] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:17,790] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:17,797] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:17,813] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:17,820] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:17,835] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:17,842] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:17,849] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:17,865] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:17,872] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:17,888] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:17,896] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:17,911] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:17,917] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:17,925] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:17,942] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:17,950] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:17,968] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:17,976] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:17,993] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,000] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,015] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,021] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,029] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,045] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,052] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,068] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,075] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,090] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,096] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,104] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,119] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,127] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,142] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,150] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,164] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,171] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,178] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,194] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,201] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,217] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,224] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,239] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,245] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,252] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,269] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,278] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,295] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,302] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,318] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,325] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,340] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,346] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,354] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,370] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,377] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,393] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,400] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,415] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,421] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,429] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,445] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,453] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,468] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,475] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,490] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,497] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,504] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,520] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,527] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,543] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,550] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,565] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,572] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,579] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,594] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,602] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,617] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,624] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,640] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,646] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,653] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,669] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,676] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,692] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,699] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,714] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,721] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,728] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,744] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,753] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,769] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,777] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,793] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,800] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,816] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,822] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,830] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,845] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,853] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,869] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,876] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,891] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,897] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,905] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,920] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,928] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,943] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,950] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,965] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,972] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,978] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,983] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,988] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
[2022-11-06 03:31:18,993] torch._inductor.ir: [WARNING] Using FallbackKernel: aten._fused_moving_avg_obs_fq_helper_functional
ERROR:common:name 's0' is not defined
Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1122, in check_accuracy
    new_result = optimized_model_iter_fn(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1020, in run_n_iterations
    self.model_iter_fn(mod, inputs, collect_outputs=False)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 332, in forward_and_backward_pass
    cloned_inputs = clone_inputs(inputs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 335, in <graph break in forward_and_backward_pass>
    pred = mod(*cloned_inputs)
  File "/scratch/ezyang/work/pytorch/torch/fx/graph_module.py", line 660, in call_wrapped
    return self._wrapped_call(self, *args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/fx/graph_module.py", line 279, in __call__
    raise e
  File "/scratch/ezyang/work/pytorch/torch/fx/graph_module.py", line 269, in __call__
    return super(self.cls, obj).__call__(*args, **kwargs)  # type: ignore[misc]
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "<eval_with_key>.8", line 4, in forward
    def forward(self, x : torch.Tensor) -> torch.Tensor:
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 954, in forward
    return compiled_f(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 945, in new_func
    return compiled_fn(args)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 321, in g
    return f(*args)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 573, in compiled_function
    return CompiledFunction.apply(*remove_dupe_args(args))
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 528, in forward
    fw_outs = call_func_with_args(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 346, in call_func_with_args
    out = normalize_as_list(f(args))
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 185, in run
    return model(new_inputs)
  File "/tmp/torchinductor_ezyang/vi/cvilyeiqsp7ivdfcj2cw6in2synczfq2kvep2uohh6hwn5xv5wqy.py", line 3577, in call
    return (buf1123, primals_1, primals_2, primals_4, primals_5, primals_7, primals_8, primals_10, primals_11, primals_13, primals_14, primals_16, primals_17, primals_19, primals_20, primals_22, primals_23, primals_25, primals_26, primals_28, primals_29, primals_31, primals_32, primals_34, primals_35, primals_37, primals_38, primals_40, primals_41, primals_43, primals_44, primals_46, primals_47, primals_49, primals_50, primals_52, primals_53, primals_55, primals_56, primals_58, primals_59, primals_61, primals_62, primals_64, primals_65, primals_67, primals_68, primals_70, primals_71, primals_73, primals_74, primals_76, primals_77, primals_79, primals_80, primals_82, primals_83, primals_85, primals_86, primals_88, primals_89, primals_91, primals_92, primals_94, primals_95, primals_97, primals_98, primals_100, primals_101, primals_103, primals_104, primals_106, primals_107, primals_109, primals_110, primals_112, primals_113, primals_115, primals_116, primals_118, primals_119, primals_121, primals_122, primals_124, primals_125, primals_127, primals_128, primals_130, primals_131, primals_133, primals_134, primals_136, primals_137, primals_139, primals_140, primals_142, primals_143, primals_145, primals_146, primals_148, primals_149, primals_151, primals_152, primals_154, primals_155, primals_157, primals_158, primals_169, primals_170, primals_186, primals_187, primals_203, primals_204, primals_220, primals_221, primals_237, primals_238, primals_261, primals_262, primals_278, primals_279, primals_295, primals_296, primals_319, primals_320, primals_336, primals_337, primals_353, primals_354, primals_377, primals_378, primals_394, primals_395, primals_411, primals_412, primals_428, primals_429, primals_452, primals_453, primals_469, primals_470, primals_486, primals_487, primals_510, primals_511, primals_527, primals_528, primals_544, primals_545, primals_568, primals_569, primals_585, primals_586, primals_602, primals_603, primals_626, primals_627, primals_643, 
primals_644, primals_660, primals_661, primals_677, primals_678, primals_701, primals_702, primals_718, primals_719, primals_735, primals_736, primals_759, primals_760, primals_776, primals_777, primals_793, primals_794, primals_817, primals_818, primals_834, primals_835, primals_851, primals_852, primals_875, primals_876, primals_892, primals_893, primals_909, primals_910, primals_933, primals_934, primals_950, primals_951, primals_967, primals_968, primals_991, primals_992, primals_1008, primals_1009, primals_1025, primals_1026, primals_1042, primals_1043, primals_1066, primals_1067, primals_1083, primals_1084, primals_1100, primals_1101, primals_1124, primals_1125, primals_1141, primals_1142, primals_1158, primals_1159, buf1, buf9, buf10, buf16, buf19, buf20, buf26, buf28, buf29, buf36, buf37, buf43, buf46, buf47, buf54, buf55, buf61, buf64, buf65, buf72, buf73, buf79, buf83, buf90, buf91, buf97, buf101, buf108, buf109, buf116, buf117, buf123, buf126, buf127, buf134, buf135, buf141, buf144, buf145, buf152, buf153, buf159, buf163, buf170, buf171, buf178, buf179, buf185, buf188, buf189, buf196, buf197, buf203, buf206, buf207, buf214, buf215, buf221, buf225, buf232, buf233, buf240, buf241, buf247, buf250, buf251, buf258, buf259, buf265, buf268, buf269, buf276, buf277, buf283, buf287, buf294, buf295, buf301, buf305, buf312, buf313, buf320, buf321, buf327, buf330, buf331, buf338, buf339, buf345, buf348, buf349, buf356, buf357, buf363, buf367, buf374, buf375, buf382, buf383, buf389, buf392, buf393, buf400, buf401, buf407, buf410, buf411, buf418, buf419, buf425, buf429, buf436, buf437, buf444, buf445, buf451, buf454, buf455, buf462, buf463, buf469, buf472, buf473, buf480, buf481, buf487, buf491, buf498, buf499, buf506, buf507, buf513, buf516, buf517, buf524, buf525, buf531, buf534, buf535, buf542, buf543, buf549, buf553, buf560, buf561, buf567, buf571, buf578, buf579, buf586, buf587, buf593, buf596, buf597, buf604, buf605, buf611, buf614, buf615, buf622, buf623, 
buf629, buf633, buf640, buf641, buf648, buf649, buf655, buf658, buf659, buf666, buf667, buf673, buf676, buf677, buf684, buf685, buf691, buf695, buf702, buf703, buf710, buf711, buf717, buf720, buf721, buf728, buf729, buf735, buf738, buf739, buf746, buf747, buf753, buf757, buf764, buf765, buf772, buf773, buf779, buf782, buf783, buf790, buf791, buf797, buf800, buf801, buf808, buf809, buf815, buf819, buf826, buf827, buf834, buf835, buf841, buf844, buf845, buf852, buf853, buf859, buf862, buf863, buf870, buf871, buf877, buf881, buf888, buf889, buf896, buf897, buf903, buf906, buf907, buf914, buf915, buf921, buf924, buf925, buf932, buf933, buf939, buf943, buf950, buf951, buf957, buf961, buf968, buf969, buf976, buf977, buf983, buf986, buf987, buf994, buf995, buf1001, buf1004, buf1005, buf1012, buf1013, buf1019, buf1023, buf1030, buf1031, buf1038, buf1039, buf1045, buf1048, buf1049, buf1056, buf1057, buf1063, buf1066, buf1067, buf1074, buf1075, buf1081, buf1085, buf1093, buf1102, buf1108, buf1109, buf1116, buf1124, as_strided(buf1115, (1000, 2048), (2048, 1)), buf1129, buf1130, buf1131, buf1132, buf1133, buf1134, buf1135, buf1136, buf1137, buf1138, buf1139, buf1140, buf1141, buf1142, buf1143, buf1144, buf1145, buf1146, buf1147, buf1148, buf1149, buf1150, buf1151, buf1152, buf1153, buf1154, buf1155, buf1156, buf1157, buf1158, buf1159, buf1160, buf1161, buf1162, buf1163, buf1164, buf1165, buf1166, buf1167, buf1168, buf1169, buf1170, buf1171, buf1172, buf1173, buf1174, buf1175, buf1176, buf1177, s0, )
NameError: name 's0' is not defined
TorchDynamo optimized model failed to run because of following error
cuda train resnet50_quantized_qat             FAIL
Running torchbench.py resnext50_32x4d...
ERROR:common:name 's0' is not defined
Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1122, in check_accuracy
    new_result = optimized_model_iter_fn(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1020, in run_n_iterations
    self.model_iter_fn(mod, inputs, collect_outputs=False)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 332, in forward_and_backward_pass
    cloned_inputs = clone_inputs(inputs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 335, in <graph break in forward_and_backward_pass>
    pred = mod(*cloned_inputs)
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/torchvision/torchvision/models/resnet.py", line 284, in forward
    def forward(self, x: Tensor) -> Tensor:
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 954, in forward
    return compiled_f(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 945, in new_func
    return compiled_fn(args)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 321, in g
    return f(*args)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 573, in compiled_function
    return CompiledFunction.apply(*remove_dupe_args(args))
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 528, in forward
    fw_outs = call_func_with_args(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 346, in call_func_with_args
    out = normalize_as_list(f(args))
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 185, in run
    return model(new_inputs)
  File "/tmp/torchinductor_ezyang/q4/cq43f7zlzekuwaumjjv6oiy4arsha7x4mevyghtsthjg6lapcehm.py", line 1152, in call
    return (buf110, primals_1, primals_2, primals_4, primals_5, primals_7, primals_8, primals_10, primals_11, primals_13, primals_14, primals_16, primals_17, primals_19, primals_20, primals_22, primals_23, primals_25, primals_26, primals_28, primals_29, primals_31, primals_32, primals_34, primals_35, primals_37, primals_38, primals_40, primals_41, primals_43, primals_44, primals_46, primals_47, primals_49, primals_50, primals_52, primals_53, primals_55, primals_56, primals_58, primals_59, primals_61, primals_62, primals_64, primals_65, primals_67, primals_68, primals_70, primals_71, primals_73, primals_74, primals_76, primals_77, primals_79, primals_80, primals_82, primals_83, primals_85, primals_86, primals_88, primals_89, primals_91, primals_92, primals_94, primals_95, primals_97, primals_98, primals_100, primals_101, primals_103, primals_104, primals_106, primals_107, primals_109, primals_110, primals_112, primals_113, primals_115, primals_116, primals_118, primals_119, primals_121, primals_122, primals_124, primals_125, primals_127, primals_128, primals_130, primals_131, primals_133, primals_134, primals_136, primals_137, primals_139, primals_140, primals_142, primals_143, primals_145, primals_146, primals_148, primals_149, primals_151, primals_152, primals_154, primals_155, primals_157, primals_158, primals_162, primals_163, primals_165, primals_166, primals_168, primals_169, primals_171, primals_172, primals_174, primals_175, primals_177, primals_178, primals_180, primals_181, primals_183, primals_184, primals_186, primals_187, primals_189, primals_190, primals_192, primals_193, primals_195, primals_196, primals_198, primals_199, primals_201, primals_202, primals_204, primals_205, primals_207, primals_208, primals_210, primals_211, primals_213, primals_214, primals_216, primals_217, primals_219, primals_220, primals_222, primals_223, primals_225, primals_226, primals_228, primals_229, primals_231, primals_232, primals_234, primals_235, primals_237, 
primals_238, primals_240, primals_241, primals_243, primals_244, primals_246, primals_247, primals_249, primals_250, primals_252, primals_253, primals_255, primals_256, primals_258, primals_259, primals_261, primals_262, primals_264, primals_265, primals_267, primals_268, primals_270, primals_271, primals_273, primals_274, primals_276, primals_277, primals_279, primals_280, primals_282, primals_283, primals_285, primals_286, primals_288, primals_289, primals_291, primals_292, primals_294, primals_295, primals_297, primals_298, primals_300, primals_301, primals_303, primals_304, primals_306, primals_307, primals_309, primals_310, primals_312, primals_313, primals_315, primals_316, primals_318, primals_319, primals_321, buf0, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf11, buf12, buf13, buf14, buf15, buf16, buf17, buf18, buf19, buf20, buf21, buf22, buf23, buf24, buf25, buf26, buf27, buf28, buf29, buf31, buf32, buf33, buf34, buf35, buf36, buf37, buf38, buf39, buf40, buf41, buf42, buf43, buf44, buf45, buf46, buf47, buf48, buf49, buf50, buf51, buf52, buf53, buf54, buf55, buf57, buf58, buf59, buf60, buf61, buf62, buf63, buf64, buf65, buf66, buf67, buf68, buf69, buf70, buf71, buf72, buf73, buf74, buf75, buf76, buf77, buf78, buf79, buf80, buf81, buf82, buf83, buf84, buf85, buf86, buf87, buf88, buf89, buf90, buf91, buf92, buf93, buf95, buf96, buf97, buf98, buf99, buf100, buf101, buf102, buf103, buf104, buf105, buf106, as_strided(buf109, (2, 2048), (2048, 1)), as_strided(primals_160, (1000, 2048), (2048, 1)), buf111, s0, )
NameError: name 's0' is not defined
TorchDynamo optimized model failed to run because of following error
cuda train resnext50_32x4d                    FAIL
Running torchbench.py shufflenet_v2_x1_0...
ERROR:common:[TensorBox(StorageBox(
  MatrixMultiplyAdd(
    name=buf161,
    layout=FlexibleLayout('cuda', torch.float32, size=[2, 1000], stride=[1000, 1]),
    inputs=[InputBuffer(name='primals_170', layout=FixedLayout('cuda', torch.float32, size=[1000], stride=[1])), ComputedBuffer(name='buf160', layout=FlexibleLayout('cuda', torch.float32, size=[2, 1024], stride=[1024, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      load(buf159, i1 + 1024 * i0) / index_expr(49, torch.float32),
      ranges=[2, 1024],
      origins={mean}
    )), ReinterpretView(
      StorageBox(
        InputBuffer(name='primals_169', layout=FixedLayout('cuda', torch.float32, size=[1000, 1024], stride=[1024, 1]))
      ),
      FixedLayout('cuda', torch.float32, size=[1024, 1000], stride=[1, 1024]),
      origins={permute_16}
    )],
    constant_args=(),
    kwargs={'beta': 1, 'alpha': 1},
    output_view=None,
    origins={addmm}
  )
)), TensorBox(StorageBox(
  InputBuffer(name='primals_1', layout=FixedLayout('cuda', torch.float32, size=[24, 3, 3, 3], stride=[27, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_2', layout=FixedLayout('cuda', torch.float32, size=[24], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_4', layout=FixedLayout('cuda', torch.float32, size=[24, 1, 3, 3], stride=[9, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_5', layout=FixedLayout('cuda', torch.float32, size=[24], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_7', layout=FixedLayout('cuda', torch.float32, size=[58, 24, 1, 1], stride=[24, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_8', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_10', layout=FixedLayout('cuda', torch.float32, size=[58, 24, 1, 1], stride=[24, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_11', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_13', layout=FixedLayout('cuda', torch.float32, size=[58, 1, 3, 3], stride=[9, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_14', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_16', layout=FixedLayout('cuda', torch.float32, size=[58, 58, 1, 1], stride=[58, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_17', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_19', layout=FixedLayout('cuda', torch.float32, size=[58, 58, 1, 1], stride=[58, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_20', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_22', layout=FixedLayout('cuda', torch.float32, size=[58, 1, 3, 3], stride=[9, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_23', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_25', layout=FixedLayout('cuda', torch.float32, size=[58, 58, 1, 1], stride=[58, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_26', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_28', layout=FixedLayout('cuda', torch.float32, size=[58, 58, 1, 1], stride=[58, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_29', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_31', layout=FixedLayout('cuda', torch.float32, size=[58, 1, 3, 3], stride=[9, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_32', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_34', layout=FixedLayout('cuda', torch.float32, size=[58, 58, 1, 1], stride=[58, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_35', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_37', layout=FixedLayout('cuda', torch.float32, size=[58, 58, 1, 1], stride=[58, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_38', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_40', layout=FixedLayout('cuda', torch.float32, size=[58, 1, 3, 3], stride=[9, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_41', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_43', layout=FixedLayout('cuda', torch.float32, size=[58, 58, 1, 1], stride=[58, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_44', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_46', layout=FixedLayout('cuda', torch.float32, size=[116, 1, 3, 3], stride=[9, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_47', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_49', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_50', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_52', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_53', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_55', layout=FixedLayout('cuda', torch.float32, size=[116, 1, 3, 3], stride=[9, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_56', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_58', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_59', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_61', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_62', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_64', layout=FixedLayout('cuda', torch.float32, size=[116, 1, 3, 3], stride=[9, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_65', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_67', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_68', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_70', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_71', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_73', layout=FixedLayout('cuda', torch.float32, size=[116, 1, 3, 3], stride=[9, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_74', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_76', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_77', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_79', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_80', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_82', layout=FixedLayout('cuda', torch.float32, size=[116, 1, 3, 3], stride=[9, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_83', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_85', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_86', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_88', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_89', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_91', layout=FixedLayout('cuda', torch.float32, size=[116, 1, 3, 3], stride=[9, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_92', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_94', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_95', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_97', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_98', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_100', layout=FixedLayout('cuda', torch.float32, size=[116, 1, 3, 3], stride=[9, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_101', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_103', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_104', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_106', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_107', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_109', layout=FixedLayout('cuda', torch.float32, size=[116, 1, 3, 3], stride=[9, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_110', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_112', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_113', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_115', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_116', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_118', layout=FixedLayout('cuda', torch.float32, size=[116, 1, 3, 3], stride=[9, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_119', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_121', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_122', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_124', layout=FixedLayout('cuda', torch.float32, size=[232, 1, 3, 3], stride=[9, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_125', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_127', layout=FixedLayout('cuda', torch.float32, size=[232, 232, 1, 1], stride=[232, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_128', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_130', layout=FixedLayout('cuda', torch.float32, size=[232, 232, 1, 1], stride=[232, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_131', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_133', layout=FixedLayout('cuda', torch.float32, size=[232, 1, 3, 3], stride=[9, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_134', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_136', layout=FixedLayout('cuda', torch.float32, size=[232, 232, 1, 1], stride=[232, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_137', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_139', layout=FixedLayout('cuda', torch.float32, size=[232, 232, 1, 1], stride=[232, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_140', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_142', layout=FixedLayout('cuda', torch.float32, size=[232, 1, 3, 3], stride=[9, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_143', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_145', layout=FixedLayout('cuda', torch.float32, size=[232, 232, 1, 1], stride=[232, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_146', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_148', layout=FixedLayout('cuda', torch.float32, size=[232, 232, 1, 1], stride=[232, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_149', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_151', layout=FixedLayout('cuda', torch.float32, size=[232, 1, 3, 3], stride=[9, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_152', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_154', layout=FixedLayout('cuda', torch.float32, size=[232, 232, 1, 1], stride=[232, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_155', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_157', layout=FixedLayout('cuda', torch.float32, size=[232, 232, 1, 1], stride=[232, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_158', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_160', layout=FixedLayout('cuda', torch.float32, size=[232, 1, 3, 3], stride=[9, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_161', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_163', layout=FixedLayout('cuda', torch.float32, size=[232, 232, 1, 1], stride=[232, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_164', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_166', layout=FixedLayout('cuda', torch.float32, size=[1024, 464, 1, 1], stride=[464, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_167', layout=FixedLayout('cuda', torch.float32, size=[1024], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_171', layout=FixedLayout('cuda', torch.float32, size=[24], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_172', layout=FixedLayout('cuda', torch.float32, size=[24], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_174', layout=FixedLayout('cuda', torch.float32, size=[24], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_175', layout=FixedLayout('cuda', torch.float32, size=[24], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_177', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_178', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_180', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_181', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_183', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_184', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_186', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_187', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_189', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_190', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_192', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_193', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_195', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_196', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_198', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_199', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_201', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_202', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_204', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_205', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_207', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_208', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_210', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_211', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_213', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_214', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_216', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_217', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_219', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_220', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_222', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_223', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_225', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_226', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_228', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_229', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_231', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_232', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_234', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_235', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_237', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_238', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_240', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_241', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_243', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_244', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_246', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_247', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_249', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_250', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_252', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_253', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_255', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_256', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_258', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_259', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_261', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_262', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_264', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_265', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_267', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_268', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_270', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_271', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_273', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_274', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_276', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_277', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_279', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_280', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_282', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_283', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_285', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_286', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_288', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_289', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_291', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_292', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_294', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_295', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_297', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_298', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_300', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_301', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_303', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_304', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_306', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_307', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_309', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_310', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_312', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_313', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_315', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_316', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_318', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_319', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_321', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_322', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_324', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_325', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_327', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_328', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_330', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_331', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_333', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_334', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_336', layout=FixedLayout('cuda', torch.float32, size=[1024], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_337', layout=FixedLayout('cuda', torch.float32, size=[1024], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_339', layout=FixedLayout('cuda', torch.float32, size=[s0, 3, s2, s2], stride=[3*s2**2, s2**2, s2, 1]))
)), TensorBox(StorageBox(
  Convolution(
    name=buf0,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 24, 112, 112]), stride=[301056, 12544, 112, 1]),
    inputs=[InputBuffer(name='primals_339', layout=FixedLayout('cuda', torch.float32, size=[s0, 3, s2, s2], stride=[3*s2**2, s2**2, s2, 1])), InputBuffer(name='primals_1', layout=FixedLayout('cuda', torch.float32, size=[24, 3, 3, 3], stride=[27, 9, 3, 1]))],
    constant_args=(None, (2, 2), (1, 1), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf1', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 24, 112, 112]), stride=[301056, 12544, 112, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf0, i3 + 112 * i2 + 12544 * i1 + 301056 * i0) - load(primals_171, i1) * reciprocal(sqrt(load(primals_172, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_2, i1) + load(primals_3, i1)),
    ranges=torch.Size([2, 24, 112, 112]),
    origins={relu}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf2', layout=FixedLayout('cuda', torch.float32, size=[2, 24, 56, 56], stride=[75264, 3136, 56, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    maximum(masked(index_expr(1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 113 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 112 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 111 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 1 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, 
torch.int64) < index_expr(112, torch.int64), load(buf1, -1 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -111 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -112 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -113 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf))))))))),
    ranges=[2, 24, 56, 56],
    origins={max_pool2d_with_indices}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf3', layout=FlexibleLayout('cuda', torch.int64, size=[2, 24, 56, 56], stride=[75264, 3136, 56, 1]), data=Pointwise(
    'cuda',
    torch.int64,
    where(masked(index_expr(1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 113 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf) > maximum(masked(index_expr(1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 112 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 111 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 1 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, 
torch.int64) < index_expr(112, torch.int64), load(buf1, -1 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -111 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -112 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -113 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf)))))))), index_expr(113 + 2 * i3 + 224 * i2, torch.int64), where(masked(index_expr(1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 112 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf) > maximum(masked(index_expr(1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 111 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(2 * i2, 
torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 1 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -1 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -111 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -112 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, 
torch.int64), load(buf1, -113 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf))))))), index_expr(112 + 2 * i3 + 224 * i2, torch.int64), where(masked(index_expr(1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 111 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf) > maximum(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 1 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -1 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -111 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 
+ 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -112 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -113 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf)))))), index_expr(111 + 2 * i3 + 224 * i2, torch.int64), where(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 1 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf) > maximum(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -1 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), 
load(buf1, -111 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -112 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -113 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf))))), index_expr(1 + 2 * i3 + 224 * i2, torch.int64), where(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf) > maximum(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -1 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -111 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, 
torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -112 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -113 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf)))), index_expr(2 * i3 + 224 * i2, torch.int64), where(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -1 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf) > maximum(masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -111 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -112 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), 
load(buf1, -113 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf))), index_expr(-1 + 2 * i3 + 224 * i2, torch.int64), where(masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -111 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf) > maximum(masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -112 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -113 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf)), index_expr(-111 + 2 * i3 + 224 * i2, torch.int64), where(masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -112 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf) > masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -113 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), index_expr(-112 + 2 * i3 + 224 * i2, 
torch.int64), index_expr(-113 + 2 * i3 + 224 * i2, torch.int64))))))))),
    ranges=[2, 24, 56, 56],
    origins={max_pool2d_with_indices}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf4,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 24, 28, 28]), stride=[18816, 784, 28, 1]),
    inputs=[ComputedBuffer(name='buf2', layout=FixedLayout('cuda', torch.float32, size=[2, 24, 56, 56], stride=[75264, 3136, 56, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      maximum(masked(index_expr(1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 113 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 112 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 111 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 1 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, 
torch.int64) < index_expr(112, torch.int64), load(buf1, -1 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -111 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -112 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -113 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf))))))))),
      ranges=[2, 24, 56, 56],
      origins={max_pool2d_with_indices}
    )), InputBuffer(name='primals_4', layout=FixedLayout('cuda', torch.float32, size=[24, 1, 3, 3], stride=[9, 9, 3, 1]))],
    constant_args=(None, (2, 2), (1, 1), (1, 1), False, (0, 0), 24),
    kwargs={},
    output_view=None,
    origins={convolution_1}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf5', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 24, 28, 28]), stride=[18816, 784, 28, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf4, i3 + 28 * i2 + 784 * i1 + 18816 * i0) - load(primals_174, i1) * reciprocal(sqrt(load(primals_175, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_5, i1) + load(primals_6, i1),
    ranges=torch.Size([2, 24, 28, 28]),
    origins={add_3}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf6,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]),
    inputs=[ComputedBuffer(name='buf5', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 24, 28, 28]), stride=[18816, 784, 28, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      load(buf4, i3 + 28 * i2 + 784 * i1 + 18816 * i0) - load(primals_174, i1) * reciprocal(sqrt(load(primals_175, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_5, i1) + load(primals_6, i1),
      ranges=torch.Size([2, 24, 28, 28]),
      origins={add_3}
    )), InputBuffer(name='primals_7', layout=FixedLayout('cuda', torch.float32, size=[58, 24, 1, 1], stride=[24, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_2}
  )
)), TensorBox(StorageBox(
  Convolution(
    name=buf8,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 58, 56, 56]), stride=[181888, 3136, 56, 1]),
    inputs=[ComputedBuffer(name='buf2', layout=FixedLayout('cuda', torch.float32, size=[2, 24, 56, 56], stride=[75264, 3136, 56, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      maximum(masked(index_expr(1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 113 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 112 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 111 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 1 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, 
torch.int64) < index_expr(112, torch.int64), load(buf1, -1 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -111 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -112 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -113 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf))))))))),
      ranges=[2, 24, 56, 56],
      origins={max_pool2d_with_indices}
    )), InputBuffer(name='primals_10', layout=FixedLayout('cuda', torch.float32, size=[58, 24, 1, 1], stride=[24, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_3}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf9', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 58, 56, 56]), stride=[181888, 3136, 56, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf8, i3 + 56 * i2 + 3136 * i1 + 181888 * i0) - load(primals_180, i1) * reciprocal(sqrt(load(primals_181, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_11, i1) + load(primals_12, i1)),
    ranges=torch.Size([2, 58, 56, 56]),
    origins={relu_2}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf10,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]),
    inputs=[ComputedBuffer(name='buf9', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 58, 56, 56]), stride=[181888, 3136, 56, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      relu(load(buf8, i3 + 56 * i2 + 3136 * i1 + 181888 * i0) - load(primals_180, i1) * reciprocal(sqrt(load(primals_181, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_11, i1) + load(primals_12, i1)),
      ranges=torch.Size([2, 58, 56, 56]),
      origins={relu_2}
    )), InputBuffer(name='primals_13', layout=FixedLayout('cuda', torch.float32, size=[58, 1, 3, 3], stride=[9, 9, 3, 1]))],
    constant_args=(None, (2, 2), (1, 1), (1, 1), False, (0, 0), 58),
    kwargs={},
    output_view=None,
    origins={convolution_4}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf11', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf10, i3 + 28 * i2 + 784 * i1 + 45472 * i0) - load(primals_183, i1) * reciprocal(sqrt(load(primals_184, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_14, i1) + load(primals_15, i1),
    ranges=torch.Size([2, 58, 28, 28]),
    origins={add_9}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf12,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]),
    inputs=[ComputedBuffer(name='buf11', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      load(buf10, i3 + 28 * i2 + 784 * i1 + 45472 * i0) - load(primals_183, i1) * reciprocal(sqrt(load(primals_184, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_14, i1) + load(primals_15, i1),
      ranges=torch.Size([2, 58, 28, 28]),
      origins={add_9}
    )), InputBuffer(name='primals_16', layout=FixedLayout('cuda', torch.float32, size=[58, 58, 1, 1], stride=[58, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_5}
  )
)), TensorBox(
  SliceView(
    View(
      StorageBox(
        ComputedBuffer(name='buf15', layout=FixedLayout('cuda', torch.float32, size=[2, 58, 2, 28, 28], stride=[90944, 1568, 784, 28, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf14, i4 + 28 * i3 + 784 * i1 + 45472 * i2 + 90944 * i0),
          ranges=[2, 58, 2, 28, 28],
          origins={clone}
        ))
      ),
      size=(2, 116, 28, 28),
      reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 58), ModularIndexing(i1, 1, 2), i2, i3],
      origins={view_1}
    ),
    size=[2, 58, 28, 28],
    reindex=lambda i0, i1, i2, i3: [i0, i1 + 58, i2, i3],
    origins={split}
  )
), TensorBox(StorageBox(
  Convolution(
    name=buf16,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]),
    inputs=[ReinterpretView(
      View(
        StorageBox(
          ComputedBuffer(name='buf15', layout=FixedLayout('cuda', torch.float32, size=[2, 58, 2, 28, 28], stride=[90944, 1568, 784, 28, 1]), data=Pointwise(
            'cuda',
            torch.float32,
            load(buf14, i4 + 28 * i3 + 784 * i1 + 45472 * i2 + 90944 * i0),
            ranges=[2, 58, 2, 28, 28],
            origins={clone}
          ))
        ),
        size=(2, 116, 28, 28),
        reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 58), ModularIndexing(i1, 1, 2), i2, i3],
        origins={view_1}
      ),
      FixedLayout('cuda', torch.float32, size=[2, 58, 28, 28], stride=[90944, 784, 28, 1], offset=45472),
      origins={convolution_6}
    ), InputBuffer(name='primals_19', layout=FixedLayout('cuda', torch.float32, size=[58, 58, 1, 1], stride=[58, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_6}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf17', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf16, i3 + 28 * i2 + 784 * i1 + 45472 * i0) - load(primals_189, i1) * reciprocal(sqrt(load(primals_190, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_20, i1) + load(primals_21, i1)),
    ranges=torch.Size([2, 58, 28, 28]),
    origins={relu_4}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf18,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]),
    inputs=[ComputedBuffer(name='buf17', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      relu(load(buf16, i3 + 28 * i2 + 784 * i1 + 45472 * i0) - load(primals_189, i1) * reciprocal(sqrt(load(primals_190, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_20, i1) + load(primals_21, i1)),
      ranges=torch.Size([2, 58, 28, 28]),
      origins={relu_4}
    )), InputBuffer(name='primals_22', layout=FixedLayout('cuda', torch.float32, size=[58, 1, 3, 3], stride=[9, 9, 3, 1]))],
    constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 58),
    kwargs={},
    output_view=None,
    origins={convolution_7}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf19', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf18, i3 + 28 * i2 + 784 * i1 + 45472 * i0) - load(primals_192, i1) * reciprocal(sqrt(load(primals_193, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_23, i1) + load(primals_24, i1),
    ranges=torch.Size([2, 58, 28, 28]),
    origins={add_15}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf20,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]),
    inputs=[ComputedBuffer(name='buf19', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      load(buf18, i3 + 28 * i2 + 784 * i1 + 45472 * i0) - load(primals_192, i1) * reciprocal(sqrt(load(primals_193, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_23, i1) + load(primals_24, i1),
      ranges=torch.Size([2, 58, 28, 28]),
      origins={add_15}
    )), InputBuffer(name='primals_25', layout=FixedLayout('cuda', torch.float32, size=[58, 58, 1, 1], stride=[58, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_8}
  )
)), TensorBox(
  SliceView(
    View(
      StorageBox(
        ComputedBuffer(name='buf24', layout=FixedLayout('cuda', torch.float32, size=[2, 58, 2, 28, 28], stride=[90944, 1568, 784, 28, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf23, i4 + 28 * i3 + 784 * i1 + 45472 * i2 + 90944 * i0),
          ranges=[2, 58, 2, 28, 28],
          origins={clone_1}
        ))
      ),
      size=(2, 116, 28, 28),
      reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 58), ModularIndexing(i1, 1, 2), i2, i3],
      origins={view_3}
    ),
    size=[2, 58, 28, 28],
    reindex=lambda i0, i1, i2, i3: [i0, i1 + 58, i2, i3],
    origins={split_1}
  )
), TensorBox(StorageBox(
  Convolution(
    name=buf25,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]),
    inputs=[ReinterpretView(
      View(
        StorageBox(
          ComputedBuffer(name='buf24', layout=FixedLayout('cuda', torch.float32, size=[2, 58, 2, 28, 28], stride=[90944, 1568, 784, 28, 1]), data=Pointwise(
            'cuda',
            torch.float32,
            load(buf23, i4 + 28 * i3 + 784 * i1 + 45472 * i2 + 90944 * i0),
            ranges=[2, 58, 2, 28, 28],
            origins={clone_1}
          ))
        ),
        size=(2, 116, 28, 28),
        reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 58), ModularIndexing(i1, 1, 2), i2, i3],
        origins={view_3}
      ),
      FixedLayout('cuda', torch.float32, size=[2, 58, 28, 28], stride=[90944, 784, 28, 1], offset=45472),
      origins={convolution_9}
    ), InputBuffer(name='primals_28', layout=FixedLayout('cuda', torch.float32, size=[58, 58, 1, 1], stride=[58, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_9}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf26', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf25, i3 + 28 * i2 + 784 * i1 + 45472 * i0) - load(primals_198, i1) * reciprocal(sqrt(load(primals_199, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_29, i1) + load(primals_30, i1)),
    ranges=torch.Size([2, 58, 28, 28]),
    origins={relu_6}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf27,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]),
    inputs=[ComputedBuffer(name='buf26', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      relu(load(buf25, i3 + 28 * i2 + 784 * i1 + 45472 * i0) - load(primals_198, i1) * reciprocal(sqrt(load(primals_199, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_29, i1) + load(primals_30, i1)),
      ranges=torch.Size([2, 58, 28, 28]),
      origins={relu_6}
    )), InputBuffer(name='primals_31', layout=FixedLayout('cuda', torch.float32, size=[58, 1, 3, 3], stride=[9, 9, 3, 1]))],
    constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 58),
    kwargs={},
    output_view=None,
    origins={convolution_10}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf28', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf27, i3 + 28 * i2 + 784 * i1 + 45472 * i0) - load(primals_201, i1) * reciprocal(sqrt(load(primals_202, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_32, i1) + load(primals_33, i1),
    ranges=torch.Size([2, 58, 28, 28]),
    origins={add_21}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf29,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]),
    inputs=[ComputedBuffer(name='buf28', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      load(buf27, i3 + 28 * i2 + 784 * i1 + 45472 * i0) - load(primals_201, i1) * reciprocal(sqrt(load(primals_202, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_32, i1) + load(primals_33, i1),
      ranges=torch.Size([2, 58, 28, 28]),
      origins={add_21}
    )), InputBuffer(name='primals_34', layout=FixedLayout('cuda', torch.float32, size=[58, 58, 1, 1], stride=[58, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_11}
  )
)), TensorBox(
  SliceView(
    View(
      StorageBox(
        ComputedBuffer(name='buf33', layout=FixedLayout('cuda', torch.float32, size=[2, 58, 2, 28, 28], stride=[90944, 1568, 784, 28, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf32, i4 + 28 * i3 + 784 * i1 + 45472 * i2 + 90944 * i0),
          ranges=[2, 58, 2, 28, 28],
          origins={clone_2}
        ))
      ),
      size=(2, 116, 28, 28),
      reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 58), ModularIndexing(i1, 1, 2), i2, i3],
      origins={view_5}
    ),
    size=[2, 58, 28, 28],
    reindex=lambda i0, i1, i2, i3: [i0, i1 + 58, i2, i3],
    origins={split_2}
  )
), TensorBox(StorageBox(
  Convolution(
    name=buf34,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]),
    inputs=[ReinterpretView(
      View(
        StorageBox(
          ComputedBuffer(name='buf33', layout=FixedLayout('cuda', torch.float32, size=[2, 58, 2, 28, 28], stride=[90944, 1568, 784, 28, 1]), data=Pointwise(
            'cuda',
            torch.float32,
            load(buf32, i4 + 28 * i3 + 784 * i1 + 45472 * i2 + 90944 * i0),
            ranges=[2, 58, 2, 28, 28],
            origins={clone_2}
          ))
        ),
        size=(2, 116, 28, 28),
        reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 58), ModularIndexing(i1, 1, 2), i2, i3],
        origins={view_5}
      ),
      FixedLayout('cuda', torch.float32, size=[2, 58, 28, 28], stride=[90944, 784, 28, 1], offset=45472),
      origins={convolution_12}
    ), InputBuffer(name='primals_37', layout=FixedLayout('cuda', torch.float32, size=[58, 58, 1, 1], stride=[58, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_12}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf35', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf34, i3 + 28 * i2 + 784 * i1 + 45472 * i0) - load(primals_207, i1) * reciprocal(sqrt(load(primals_208, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_38, i1) + load(primals_39, i1)),
    ranges=torch.Size([2, 58, 28, 28]),
    origins={relu_8}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf36,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]),
    inputs=[ComputedBuffer(name='buf35', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      relu(load(buf34, i3 + 28 * i2 + 784 * i1 + 45472 * i0) - load(primals_207, i1) * reciprocal(sqrt(load(primals_208, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_38, i1) + load(primals_39, i1)),
      ranges=torch.Size([2, 58, 28, 28]),
      origins={relu_8}
    )), InputBuffer(name='primals_40', layout=FixedLayout('cuda', torch.float32, size=[58, 1, 3, 3], stride=[9, 9, 3, 1]))],
    constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 58),
    kwargs={},
    output_view=None,
    origins={convolution_13}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf37', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf36, i3 + 28 * i2 + 784 * i1 + 45472 * i0) - load(primals_210, i1) * reciprocal(sqrt(load(primals_211, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_41, i1) + load(primals_42, i1),
    ranges=torch.Size([2, 58, 28, 28]),
    origins={add_27}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf38,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]),
    inputs=[ComputedBuffer(name='buf37', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      load(buf36, i3 + 28 * i2 + 784 * i1 + 45472 * i0) - load(primals_210, i1) * reciprocal(sqrt(load(primals_211, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_41, i1) + load(primals_42, i1),
      ranges=torch.Size([2, 58, 28, 28]),
      origins={add_27}
    )), InputBuffer(name='primals_43', layout=FixedLayout('cuda', torch.float32, size=[58, 58, 1, 1], stride=[58, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_14}
  )
)), TensorBox(
  View(
    StorageBox(
      ComputedBuffer(name='buf42', layout=FixedLayout('cuda', torch.float32, size=[2, 58, 2, 28, 28], stride=[90944, 1568, 784, 28, 1]), data=Pointwise(
        'cuda',
        torch.float32,
        load(buf41, i4 + 28 * i3 + 784 * i1 + 45472 * i2 + 90944 * i0),
        ranges=[2, 58, 2, 28, 28],
        origins={clone_3}
      ))
    ),
    size=(2, 116, 28, 28),
    reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 58), ModularIndexing(i1, 1, 2), i2, i3],
    origins={view_7}
  )
), TensorBox(StorageBox(
  Convolution(
    name=buf43,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ReinterpretView(
      StorageBox(
        ComputedBuffer(name='buf42', layout=FixedLayout('cuda', torch.float32, size=[2, 58, 2, 28, 28], stride=[90944, 1568, 784, 28, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf41, i4 + 28 * i3 + 784 * i1 + 45472 * i2 + 90944 * i0),
          ranges=[2, 58, 2, 28, 28],
          origins={clone_3}
        ))
      ),
      FixedLayout('cuda', torch.float32, size=(2, 116, 28, 28), stride=[90944, 784, 28, 1]),
      origins={convolution_15}
    ), InputBuffer(name='primals_46', layout=FixedLayout('cuda', torch.float32, size=[116, 1, 3, 3], stride=[9, 9, 3, 1]))],
    constant_args=(None, (2, 2), (1, 1), (1, 1), False, (0, 0), 116),
    kwargs={},
    output_view=None,
    origins={convolution_15}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf44', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf43, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_216, i1) * reciprocal(sqrt(load(primals_217, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_47, i1) + load(primals_48, i1),
    ranges=torch.Size([2, 116, 14, 14]),
    origins={add_31}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf45,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ComputedBuffer(name='buf44', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      load(buf43, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_216, i1) * reciprocal(sqrt(load(primals_217, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_47, i1) + load(primals_48, i1),
      ranges=torch.Size([2, 116, 14, 14]),
      origins={add_31}
    )), InputBuffer(name='primals_49', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_16}
  )
)), TensorBox(StorageBox(
  Convolution(
    name=buf47,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 28, 28]), stride=[90944, 784, 28, 1]),
    inputs=[ReinterpretView(
      StorageBox(
        ComputedBuffer(name='buf42', layout=FixedLayout('cuda', torch.float32, size=[2, 58, 2, 28, 28], stride=[90944, 1568, 784, 28, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf41, i4 + 28 * i3 + 784 * i1 + 45472 * i2 + 90944 * i0),
          ranges=[2, 58, 2, 28, 28],
          origins={clone_3}
        ))
      ),
      FixedLayout('cuda', torch.float32, size=(2, 116, 28, 28), stride=[90944, 784, 28, 1]),
      origins={convolution_17}
    ), InputBuffer(name='primals_52', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_17}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf48', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 28, 28]), stride=[90944, 784, 28, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf47, i3 + 28 * i2 + 784 * i1 + 90944 * i0) - load(primals_222, i1) * reciprocal(sqrt(load(primals_223, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_53, i1) + load(primals_54, i1)),
    ranges=torch.Size([2, 116, 28, 28]),
    origins={relu_11}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf49,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ComputedBuffer(name='buf48', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 28, 28]), stride=[90944, 784, 28, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      relu(load(buf47, i3 + 28 * i2 + 784 * i1 + 90944 * i0) - load(primals_222, i1) * reciprocal(sqrt(load(primals_223, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_53, i1) + load(primals_54, i1)),
      ranges=torch.Size([2, 116, 28, 28]),
      origins={relu_11}
    )), InputBuffer(name='primals_55', layout=FixedLayout('cuda', torch.float32, size=[116, 1, 3, 3], stride=[9, 9, 3, 1]))],
    constant_args=(None, (2, 2), (1, 1), (1, 1), False, (0, 0), 116),
    kwargs={},
    output_view=None,
    origins={convolution_18}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf50', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf49, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_225, i1) * reciprocal(sqrt(load(primals_226, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_56, i1) + load(primals_57, i1),
    ranges=torch.Size([2, 116, 14, 14]),
    origins={add_37}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf51,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ComputedBuffer(name='buf50', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      load(buf49, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_225, i1) * reciprocal(sqrt(load(primals_226, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_56, i1) + load(primals_57, i1),
      ranges=torch.Size([2, 116, 14, 14]),
      origins={add_37}
    )), InputBuffer(name='primals_58', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_19}
  )
)), TensorBox(
  SliceView(
    View(
      StorageBox(
        ComputedBuffer(name='buf54', layout=FixedLayout('cuda', torch.float32, size=[2, 116, 2, 14, 14], stride=[45472, 392, 196, 14, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf53, i4 + 14 * i3 + 196 * i1 + 22736 * i2 + 45472 * i0),
          ranges=[2, 116, 2, 14, 14],
          origins={clone_4}
        ))
      ),
      size=(2, 232, 14, 14),
      reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 116), ModularIndexing(i1, 1, 2), i2, i3],
      origins={view_9}
    ),
    size=[2, 116, 14, 14],
    reindex=lambda i0, i1, i2, i3: [i0, i1 + 116, i2, i3],
    origins={split_3}
  )
), TensorBox(StorageBox(
  Convolution(
    name=buf55,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ReinterpretView(
      View(
        StorageBox(
          ComputedBuffer(name='buf54', layout=FixedLayout('cuda', torch.float32, size=[2, 116, 2, 14, 14], stride=[45472, 392, 196, 14, 1]), data=Pointwise(
            'cuda',
            torch.float32,
            load(buf53, i4 + 14 * i3 + 196 * i1 + 22736 * i2 + 45472 * i0),
            ranges=[2, 116, 2, 14, 14],
            origins={clone_4}
          ))
        ),
        size=(2, 232, 14, 14),
        reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 116), ModularIndexing(i1, 1, 2), i2, i3],
        origins={view_9}
      ),
      FixedLayout('cuda', torch.float32, size=[2, 116, 14, 14], stride=[45472, 196, 14, 1], offset=22736),
      origins={convolution_20}
    ), InputBuffer(name='primals_61', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_20}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf56', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf55, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_231, i1) * reciprocal(sqrt(load(primals_232, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_62, i1) + load(primals_63, i1)),
    ranges=torch.Size([2, 116, 14, 14]),
    origins={relu_13}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf57,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ComputedBuffer(name='buf56', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      relu(load(buf55, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_231, i1) * reciprocal(sqrt(load(primals_232, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_62, i1) + load(primals_63, i1)),
      ranges=torch.Size([2, 116, 14, 14]),
      origins={relu_13}
    )), InputBuffer(name='primals_64', layout=FixedLayout('cuda', torch.float32, size=[116, 1, 3, 3], stride=[9, 9, 3, 1]))],
    constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 116),
    kwargs={},
    output_view=None,
    origins={convolution_21}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf58', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf57, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_234, i1) * reciprocal(sqrt(load(primals_235, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_65, i1) + load(primals_66, i1),
    ranges=torch.Size([2, 116, 14, 14]),
    origins={add_43}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf59,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ComputedBuffer(name='buf58', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      load(buf57, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_234, i1) * reciprocal(sqrt(load(primals_235, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_65, i1) + load(primals_66, i1),
      ranges=torch.Size([2, 116, 14, 14]),
      origins={add_43}
    )), InputBuffer(name='primals_67', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_22}
  )
)), TensorBox(
  SliceView(
    View(
      StorageBox(
        ComputedBuffer(name='buf63', layout=FixedLayout('cuda', torch.float32, size=[2, 116, 2, 14, 14], stride=[45472, 392, 196, 14, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf62, i4 + 14 * i3 + 196 * i1 + 22736 * i2 + 45472 * i0),
          ranges=[2, 116, 2, 14, 14],
          origins={clone_5}
        ))
      ),
      size=(2, 232, 14, 14),
      reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 116), ModularIndexing(i1, 1, 2), i2, i3],
      origins={view_11}
    ),
    size=[2, 116, 14, 14],
    reindex=lambda i0, i1, i2, i3: [i0, i1 + 116, i2, i3],
    origins={split_4}
  )
), TensorBox(StorageBox(
  Convolution(
    name=buf64,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ReinterpretView(
      View(
        StorageBox(
          ComputedBuffer(name='buf63', layout=FixedLayout('cuda', torch.float32, size=[2, 116, 2, 14, 14], stride=[45472, 392, 196, 14, 1]), data=Pointwise(
            'cuda',
            torch.float32,
            load(buf62, i4 + 14 * i3 + 196 * i1 + 22736 * i2 + 45472 * i0),
            ranges=[2, 116, 2, 14, 14],
            origins={clone_5}
          ))
        ),
        size=(2, 232, 14, 14),
        reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 116), ModularIndexing(i1, 1, 2), i2, i3],
        origins={view_11}
      ),
      FixedLayout('cuda', torch.float32, size=[2, 116, 14, 14], stride=[45472, 196, 14, 1], offset=22736),
      origins={convolution_23}
    ), InputBuffer(name='primals_70', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_23}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf65', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf64, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_240, i1) * reciprocal(sqrt(load(primals_241, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_71, i1) + load(primals_72, i1)),
    ranges=torch.Size([2, 116, 14, 14]),
    origins={relu_15}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf66,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ComputedBuffer(name='buf65', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      relu(load(buf64, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_240, i1) * reciprocal(sqrt(load(primals_241, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_71, i1) + load(primals_72, i1)),
      ranges=torch.Size([2, 116, 14, 14]),
      origins={relu_15}
    )), InputBuffer(name='primals_73', layout=FixedLayout('cuda', torch.float32, size=[116, 1, 3, 3], stride=[9, 9, 3, 1]))],
    constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 116),
    kwargs={},
    output_view=None,
    origins={convolution_24}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf67', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf66, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_243, i1) * reciprocal(sqrt(load(primals_244, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_74, i1) + load(primals_75, i1),
    ranges=torch.Size([2, 116, 14, 14]),
    origins={add_49}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf68,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ComputedBuffer(name='buf67', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      load(buf66, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_243, i1) * reciprocal(sqrt(load(primals_244, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_74, i1) + load(primals_75, i1),
      ranges=torch.Size([2, 116, 14, 14]),
      origins={add_49}
    )), InputBuffer(name='primals_76', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_25}
  )
)), TensorBox(
  SliceView(
    View(
      StorageBox(
        ComputedBuffer(name='buf72', layout=FixedLayout('cuda', torch.float32, size=[2, 116, 2, 14, 14], stride=[45472, 392, 196, 14, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf71, i4 + 14 * i3 + 196 * i1 + 22736 * i2 + 45472 * i0),
          ranges=[2, 116, 2, 14, 14],
          origins={clone_6}
        ))
      ),
      size=(2, 232, 14, 14),
      reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 116), ModularIndexing(i1, 1, 2), i2, i3],
      origins={view_13}
    ),
    size=[2, 116, 14, 14],
    reindex=lambda i0, i1, i2, i3: [i0, i1 + 116, i2, i3],
    origins={split_5}
  )
), TensorBox(StorageBox(
  Convolution(
    name=buf73,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ReinterpretView(
      View(
        StorageBox(
          ComputedBuffer(name='buf72', layout=FixedLayout('cuda', torch.float32, size=[2, 116, 2, 14, 14], stride=[45472, 392, 196, 14, 1]), data=Pointwise(
            'cuda',
            torch.float32,
            load(buf71, i4 + 14 * i3 + 196 * i1 + 22736 * i2 + 45472 * i0),
            ranges=[2, 116, 2, 14, 14],
            origins={clone_6}
          ))
        ),
        size=(2, 232, 14, 14),
        reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 116), ModularIndexing(i1, 1, 2), i2, i3],
        origins={view_13}
      ),
      FixedLayout('cuda', torch.float32, size=[2, 116, 14, 14], stride=[45472, 196, 14, 1], offset=22736),
      origins={convolution_26}
    ), InputBuffer(name='primals_79', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_26}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf74', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf73, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_249, i1) * reciprocal(sqrt(load(primals_250, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_80, i1) + load(primals_81, i1)),
    ranges=torch.Size([2, 116, 14, 14]),
    origins={relu_17}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf75,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ComputedBuffer(name='buf74', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      relu(load(buf73, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_249, i1) * reciprocal(sqrt(load(primals_250, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_80, i1) + load(primals_81, i1)),
      ranges=torch.Size([2, 116, 14, 14]),
      origins={relu_17}
    )), InputBuffer(name='primals_82', layout=FixedLayout('cuda', torch.float32, size=[116, 1, 3, 3], stride=[9, 9, 3, 1]))],
    constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 116),
    kwargs={},
    output_view=None,
    origins={convolution_27}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf76', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf75, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_252, i1) * reciprocal(sqrt(load(primals_253, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_83, i1) + load(primals_84, i1),
    ranges=torch.Size([2, 116, 14, 14]),
    origins={add_55}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf77,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ComputedBuffer(name='buf76', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      load(buf75, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_252, i1) * reciprocal(sqrt(load(primals_253, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_83, i1) + load(primals_84, i1),
      ranges=torch.Size([2, 116, 14, 14]),
      origins={add_55}
    )), InputBuffer(name='primals_85', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_28}
  )
)), TensorBox(
  SliceView(
    View(
      StorageBox(
        ComputedBuffer(name='buf81', layout=FixedLayout('cuda', torch.float32, size=[2, 116, 2, 14, 14], stride=[45472, 392, 196, 14, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf80, i4 + 14 * i3 + 196 * i1 + 22736 * i2 + 45472 * i0),
          ranges=[2, 116, 2, 14, 14],
          origins={clone_7}
        ))
      ),
      size=(2, 232, 14, 14),
      reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 116), ModularIndexing(i1, 1, 2), i2, i3],
      origins={view_15}
    ),
    size=[2, 116, 14, 14],
    reindex=lambda i0, i1, i2, i3: [i0, i1 + 116, i2, i3],
    origins={split_6}
  )
), TensorBox(StorageBox(
  Convolution(
    name=buf82,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ReinterpretView(
      View(
        StorageBox(
          ComputedBuffer(name='buf81', layout=FixedLayout('cuda', torch.float32, size=[2, 116, 2, 14, 14], stride=[45472, 392, 196, 14, 1]), data=Pointwise(
            'cuda',
            torch.float32,
            load(buf80, i4 + 14 * i3 + 196 * i1 + 22736 * i2 + 45472 * i0),
            ranges=[2, 116, 2, 14, 14],
            origins={clone_7}
          ))
        ),
        size=(2, 232, 14, 14),
        reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 116), ModularIndexing(i1, 1, 2), i2, i3],
        origins={view_15}
      ),
      FixedLayout('cuda', torch.float32, size=[2, 116, 14, 14], stride=[45472, 196, 14, 1], offset=22736),
      origins={convolution_29}
    ), InputBuffer(name='primals_88', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_29}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf83', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf82, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_258, i1) * reciprocal(sqrt(load(primals_259, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_89, i1) + load(primals_90, i1)),
    ranges=torch.Size([2, 116, 14, 14]),
    origins={relu_19}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf84,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ComputedBuffer(name='buf83', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      relu(load(buf82, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_258, i1) * reciprocal(sqrt(load(primals_259, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_89, i1) + load(primals_90, i1)),
      ranges=torch.Size([2, 116, 14, 14]),
      origins={relu_19}
    )), InputBuffer(name='primals_91', layout=FixedLayout('cuda', torch.float32, size=[116, 1, 3, 3], stride=[9, 9, 3, 1]))],
    constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 116),
    kwargs={},
    output_view=None,
    origins={convolution_30}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf85', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf84, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_261, i1) * reciprocal(sqrt(load(primals_262, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_92, i1) + load(primals_93, i1),
    ranges=torch.Size([2, 116, 14, 14]),
    origins={add_61}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf86,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ComputedBuffer(name='buf85', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      load(buf84, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_261, i1) * reciprocal(sqrt(load(primals_262, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_92, i1) + load(primals_93, i1),
      ranges=torch.Size([2, 116, 14, 14]),
      origins={add_61}
    )), InputBuffer(name='primals_94', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_31}
  )
)), TensorBox(
  SliceView(
    View(
      StorageBox(
        ComputedBuffer(name='buf90', layout=FixedLayout('cuda', torch.float32, size=[2, 116, 2, 14, 14], stride=[45472, 392, 196, 14, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf89, i4 + 14 * i3 + 196 * i1 + 22736 * i2 + 45472 * i0),
          ranges=[2, 116, 2, 14, 14],
          origins={clone_8}
        ))
      ),
      size=(2, 232, 14, 14),
      reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 116), ModularIndexing(i1, 1, 2), i2, i3],
      origins={view_17}
    ),
    size=[2, 116, 14, 14],
    reindex=lambda i0, i1, i2, i3: [i0, i1 + 116, i2, i3],
    origins={split_7}
  )
), TensorBox(StorageBox(
  Convolution(
    name=buf91,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ReinterpretView(
      View(
        StorageBox(
          ComputedBuffer(name='buf90', layout=FixedLayout('cuda', torch.float32, size=[2, 116, 2, 14, 14], stride=[45472, 392, 196, 14, 1]), data=Pointwise(
            'cuda',
            torch.float32,
            load(buf89, i4 + 14 * i3 + 196 * i1 + 22736 * i2 + 45472 * i0),
            ranges=[2, 116, 2, 14, 14],
            origins={clone_8}
          ))
        ),
        size=(2, 232, 14, 14),
        reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 116), ModularIndexing(i1, 1, 2), i2, i3],
        origins={view_17}
      ),
      FixedLayout('cuda', torch.float32, size=[2, 116, 14, 14], stride=[45472, 196, 14, 1], offset=22736),
      origins={convolution_32}
    ), InputBuffer(name='primals_97', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_32}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf92', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf91, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_267, i1) * reciprocal(sqrt(load(primals_268, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_98, i1) + load(primals_99, i1)),
    ranges=torch.Size([2, 116, 14, 14]),
    origins={relu_21}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf93,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ComputedBuffer(name='buf92', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      relu(load(buf91, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_267, i1) * reciprocal(sqrt(load(primals_268, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_98, i1) + load(primals_99, i1)),
      ranges=torch.Size([2, 116, 14, 14]),
      origins={relu_21}
    )), InputBuffer(name='primals_100', layout=FixedLayout('cuda', torch.float32, size=[116, 1, 3, 3], stride=[9, 9, 3, 1]))],
    constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 116),
    kwargs={},
    output_view=None,
    origins={convolution_33}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf94', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf93, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_270, i1) * reciprocal(sqrt(load(primals_271, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_101, i1) + load(primals_102, i1),
    ranges=torch.Size([2, 116, 14, 14]),
    origins={add_67}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf95,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ComputedBuffer(name='buf94', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      load(buf93, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_270, i1) * reciprocal(sqrt(load(primals_271, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_101, i1) + load(primals_102, i1),
      ranges=torch.Size([2, 116, 14, 14]),
      origins={add_67}
    )), InputBuffer(name='primals_103', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_34}
  )
)), TensorBox(
  SliceView(
    View(
      StorageBox(
        ComputedBuffer(name='buf99', layout=FixedLayout('cuda', torch.float32, size=[2, 116, 2, 14, 14], stride=[45472, 392, 196, 14, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf98, i4 + 14 * i3 + 196 * i1 + 22736 * i2 + 45472 * i0),
          ranges=[2, 116, 2, 14, 14],
          origins={clone_9}
        ))
      ),
      size=(2, 232, 14, 14),
      reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 116), ModularIndexing(i1, 1, 2), i2, i3],
      origins={view_19}
    ),
    size=[2, 116, 14, 14],
    reindex=lambda i0, i1, i2, i3: [i0, i1 + 116, i2, i3],
    origins={split_8}
  )
), TensorBox(StorageBox(
  Convolution(
    name=buf100,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ReinterpretView(
      View(
        StorageBox(
          ComputedBuffer(name='buf99', layout=FixedLayout('cuda', torch.float32, size=[2, 116, 2, 14, 14], stride=[45472, 392, 196, 14, 1]), data=Pointwise(
            'cuda',
            torch.float32,
            load(buf98, i4 + 14 * i3 + 196 * i1 + 22736 * i2 + 45472 * i0),
            ranges=[2, 116, 2, 14, 14],
            origins={clone_9}
          ))
        ),
        size=(2, 232, 14, 14),
        reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 116), ModularIndexing(i1, 1, 2), i2, i3],
        origins={view_19}
      ),
      FixedLayout('cuda', torch.float32, size=[2, 116, 14, 14], stride=[45472, 196, 14, 1], offset=22736),
      origins={convolution_35}
    ), InputBuffer(name='primals_106', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_35}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf101', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf100, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_276, i1) * reciprocal(sqrt(load(primals_277, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_107, i1) + load(primals_108, i1)),
    ranges=torch.Size([2, 116, 14, 14]),
    origins={relu_23}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf102,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ComputedBuffer(name='buf101', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      relu(load(buf100, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_276, i1) * reciprocal(sqrt(load(primals_277, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_107, i1) + load(primals_108, i1)),
      ranges=torch.Size([2, 116, 14, 14]),
      origins={relu_23}
    )), InputBuffer(name='primals_109', layout=FixedLayout('cuda', torch.float32, size=[116, 1, 3, 3], stride=[9, 9, 3, 1]))],
    constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 116),
    kwargs={},
    output_view=None,
    origins={convolution_36}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf103', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf102, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_279, i1) * reciprocal(sqrt(load(primals_280, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_110, i1) + load(primals_111, i1),
    ranges=torch.Size([2, 116, 14, 14]),
    origins={add_73}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf104,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ComputedBuffer(name='buf103', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      load(buf102, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_279, i1) * reciprocal(sqrt(load(primals_280, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_110, i1) + load(primals_111, i1),
      ranges=torch.Size([2, 116, 14, 14]),
      origins={add_73}
    )), InputBuffer(name='primals_112', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_37}
  )
)), TensorBox(
  SliceView(
    View(
      StorageBox(
        ComputedBuffer(name='buf108', layout=FixedLayout('cuda', torch.float32, size=[2, 116, 2, 14, 14], stride=[45472, 392, 196, 14, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf107, i4 + 14 * i3 + 196 * i1 + 22736 * i2 + 45472 * i0),
          ranges=[2, 116, 2, 14, 14],
          origins={clone_10}
        ))
      ),
      size=(2, 232, 14, 14),
      reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 116), ModularIndexing(i1, 1, 2), i2, i3],
      origins={view_21}
    ),
    size=[2, 116, 14, 14],
    reindex=lambda i0, i1, i2, i3: [i0, i1 + 116, i2, i3],
    origins={split_9}
  )
), TensorBox(StorageBox(
  Convolution(
    name=buf109,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ReinterpretView(
      View(
        StorageBox(
          ComputedBuffer(name='buf108', layout=FixedLayout('cuda', torch.float32, size=[2, 116, 2, 14, 14], stride=[45472, 392, 196, 14, 1]), data=Pointwise(
            'cuda',
            torch.float32,
            load(buf107, i4 + 14 * i3 + 196 * i1 + 22736 * i2 + 45472 * i0),
            ranges=[2, 116, 2, 14, 14],
            origins={clone_10}
          ))
        ),
        size=(2, 232, 14, 14),
        reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 116), ModularIndexing(i1, 1, 2), i2, i3],
        origins={view_21}
      ),
      FixedLayout('cuda', torch.float32, size=[2, 116, 14, 14], stride=[45472, 196, 14, 1], offset=22736),
      origins={convolution_38}
    ), InputBuffer(name='primals_115', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_38}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf110', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf109, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_285, i1) * reciprocal(sqrt(load(primals_286, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_116, i1) + load(primals_117, i1)),
    ranges=torch.Size([2, 116, 14, 14]),
    origins={relu_25}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf111,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ComputedBuffer(name='buf110', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      relu(load(buf109, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_285, i1) * reciprocal(sqrt(load(primals_286, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_116, i1) + load(primals_117, i1)),
      ranges=torch.Size([2, 116, 14, 14]),
      origins={relu_25}
    )), InputBuffer(name='primals_118', layout=FixedLayout('cuda', torch.float32, size=[116, 1, 3, 3], stride=[9, 9, 3, 1]))],
    constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 116),
    kwargs={},
    output_view=None,
    origins={convolution_39}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf112', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf111, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_288, i1) * reciprocal(sqrt(load(primals_289, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_119, i1) + load(primals_120, i1),
    ranges=torch.Size([2, 116, 14, 14]),
    origins={add_79}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf113,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ComputedBuffer(name='buf112', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      load(buf111, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_288, i1) * reciprocal(sqrt(load(primals_289, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_119, i1) + load(primals_120, i1),
      ranges=torch.Size([2, 116, 14, 14]),
      origins={add_79}
    )), InputBuffer(name='primals_121', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_40}
  )
)), TensorBox(
  View(
    StorageBox(
      ComputedBuffer(name='buf117', layout=FixedLayout('cuda', torch.float32, size=[2, 116, 2, 14, 14], stride=[45472, 392, 196, 14, 1]), data=Pointwise(
        'cuda',
        torch.float32,
        load(buf116, i4 + 14 * i3 + 196 * i1 + 22736 * i2 + 45472 * i0),
        ranges=[2, 116, 2, 14, 14],
        origins={clone_11}
      ))
    ),
    size=(2, 232, 14, 14),
    reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 116), ModularIndexing(i1, 1, 2), i2, i3],
    origins={view_23}
  )
), TensorBox(StorageBox(
  Convolution(
    name=buf118,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]),
    inputs=[ReinterpretView(
      StorageBox(
        ComputedBuffer(name='buf117', layout=FixedLayout('cuda', torch.float32, size=[2, 116, 2, 14, 14], stride=[45472, 392, 196, 14, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf116, i4 + 14 * i3 + 196 * i1 + 22736 * i2 + 45472 * i0),
          ranges=[2, 116, 2, 14, 14],
          origins={clone_11}
        ))
      ),
      FixedLayout('cuda', torch.float32, size=(2, 232, 14, 14), stride=[45472, 196, 14, 1]),
      origins={convolution_41}
    ), InputBuffer(name='primals_124', layout=FixedLayout('cuda', torch.float32, size=[232, 1, 3, 3], stride=[9, 9, 3, 1]))],
    constant_args=(None, (2, 2), (1, 1), (1, 1), False, (0, 0), 232),
    kwargs={},
    output_view=None,
    origins={convolution_41}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf119', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf118, i3 + 7 * i2 + 49 * i1 + 11368 * i0) - load(primals_294, i1) * reciprocal(sqrt(load(primals_295, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_125, i1) + load(primals_126, i1),
    ranges=torch.Size([2, 232, 7, 7]),
    origins={add_83}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf120,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]),
    inputs=[ComputedBuffer(name='buf119', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      load(buf118, i3 + 7 * i2 + 49 * i1 + 11368 * i0) - load(primals_294, i1) * reciprocal(sqrt(load(primals_295, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_125, i1) + load(primals_126, i1),
      ranges=torch.Size([2, 232, 7, 7]),
      origins={add_83}
    )), InputBuffer(name='primals_127', layout=FixedLayout('cuda', torch.float32, size=[232, 232, 1, 1], stride=[232, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_42}
  )
)), TensorBox(StorageBox(
  Convolution(
    name=buf122,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 232, 14, 14]), stride=[45472, 196, 14, 1]),
    inputs=[ReinterpretView(
      StorageBox(
        ComputedBuffer(name='buf117', layout=FixedLayout('cuda', torch.float32, size=[2, 116, 2, 14, 14], stride=[45472, 392, 196, 14, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf116, i4 + 14 * i3 + 196 * i1 + 22736 * i2 + 45472 * i0),
          ranges=[2, 116, 2, 14, 14],
          origins={clone_11}
        ))
      ),
      FixedLayout('cuda', torch.float32, size=(2, 232, 14, 14), stride=[45472, 196, 14, 1]),
      origins={convolution_43}
    ), InputBuffer(name='primals_130', layout=FixedLayout('cuda', torch.float32, size=[232, 232, 1, 1], stride=[232, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_43}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf123', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 232, 14, 14]), stride=[45472, 196, 14, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf122, i3 + 14 * i2 + 196 * i1 + 45472 * i0) - load(primals_300, i1) * reciprocal(sqrt(load(primals_301, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_131, i1) + load(primals_132, i1)),
    ranges=torch.Size([2, 232, 14, 14]),
    origins={relu_28}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf124,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]),
    inputs=[ComputedBuffer(name='buf123', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 232, 14, 14]), stride=[45472, 196, 14, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      relu(load(buf122, i3 + 14 * i2 + 196 * i1 + 45472 * i0) - load(primals_300, i1) * reciprocal(sqrt(load(primals_301, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_131, i1) + load(primals_132, i1)),
      ranges=torch.Size([2, 232, 14, 14]),
      origins={relu_28}
    )), InputBuffer(name='primals_133', layout=FixedLayout('cuda', torch.float32, size=[232, 1, 3, 3], stride=[9, 9, 3, 1]))],
    constant_args=(None, (2, 2), (1, 1), (1, 1), False, (0, 0), 232),
    kwargs={},
    output_view=None,
    origins={convolution_44}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf125', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf124, i3 + 7 * i2 + 49 * i1 + 11368 * i0) - load(primals_303, i1) * reciprocal(sqrt(load(primals_304, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_134, i1) + load(primals_135, i1),
    ranges=torch.Size([2, 232, 7, 7]),
    origins={add_89}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf126,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]),
    inputs=[ComputedBuffer(name='buf125', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      load(buf124, i3 + 7 * i2 + 49 * i1 + 11368 * i0) - load(primals_303, i1) * reciprocal(sqrt(load(primals_304, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_134, i1) + load(primals_135, i1),
      ranges=torch.Size([2, 232, 7, 7]),
      origins={add_89}
    )), InputBuffer(name='primals_136', layout=FixedLayout('cuda', torch.float32, size=[232, 232, 1, 1], stride=[232, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_45}
  )
)), TensorBox(
  SliceView(
    View(
      StorageBox(
        ComputedBuffer(name='buf129', layout=FixedLayout('cuda', torch.float32, size=[2, 232, 2, 7, 7], stride=[22736, 98, 49, 7, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf128, i4 + 7 * i3 + 49 * i1 + 11368 * i2 + 22736 * i0),
          ranges=[2, 232, 2, 7, 7],
          origins={clone_12}
        ))
      ),
      size=(2, 464, 7, 7),
      reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 232), ModularIndexing(i1, 1, 2), i2, i3],
      origins={view_25}
    ),
    size=[2, 232, 7, 7],
    reindex=lambda i0, i1, i2, i3: [i0, i1 + 232, i2, i3],
    origins={split_10}
  )
), TensorBox(StorageBox(
  Convolution(
    name=buf130,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]),
    inputs=[ReinterpretView(
      View(
        StorageBox(
          ComputedBuffer(name='buf129', layout=FixedLayout('cuda', torch.float32, size=[2, 232, 2, 7, 7], stride=[22736, 98, 49, 7, 1]), data=Pointwise(
            'cuda',
            torch.float32,
            load(buf128, i4 + 7 * i3 + 49 * i1 + 11368 * i2 + 22736 * i0),
            ranges=[2, 232, 2, 7, 7],
            origins={clone_12}
          ))
        ),
        size=(2, 464, 7, 7),
        reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 232), ModularIndexing(i1, 1, 2), i2, i3],
        origins={view_25}
      ),
      FixedLayout('cuda', torch.float32, size=[2, 232, 7, 7], stride=[22736, 49, 7, 1], offset=11368),
      origins={convolution_46}
    ), InputBuffer(name='primals_139', layout=FixedLayout('cuda', torch.float32, size=[232, 232, 1, 1], stride=[232, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_46}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf131', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf130, i3 + 7 * i2 + 49 * i1 + 11368 * i0) - load(primals_309, i1) * reciprocal(sqrt(load(primals_310, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_140, i1) + load(primals_141, i1)),
    ranges=torch.Size([2, 232, 7, 7]),
    origins={relu_30}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf132,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]),
    inputs=[ComputedBuffer(name='buf131', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      relu(load(buf130, i3 + 7 * i2 + 49 * i1 + 11368 * i0) - load(primals_309, i1) * reciprocal(sqrt(load(primals_310, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_140, i1) + load(primals_141, i1)),
      ranges=torch.Size([2, 232, 7, 7]),
      origins={relu_30}
    )), InputBuffer(name='primals_142', layout=FixedLayout('cuda', torch.float32, size=[232, 1, 3, 3], stride=[9, 9, 3, 1]))],
    constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 232),
    kwargs={},
    output_view=None,
    origins={convolution_47}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf133', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf132, i3 + 7 * i2 + 49 * i1 + 11368 * i0) - load(primals_312, i1) * reciprocal(sqrt(load(primals_313, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_143, i1) + load(primals_144, i1),
    ranges=torch.Size([2, 232, 7, 7]),
    origins={add_95}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf134,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]),
    inputs=[ComputedBuffer(name='buf133', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      load(buf132, i3 + 7 * i2 + 49 * i1 + 11368 * i0) - load(primals_312, i1) * reciprocal(sqrt(load(primals_313, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_143, i1) + load(primals_144, i1),
      ranges=torch.Size([2, 232, 7, 7]),
      origins={add_95}
    )), InputBuffer(name='primals_145', layout=FixedLayout('cuda', torch.float32, size=[232, 232, 1, 1], stride=[232, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_48}
  )
)), TensorBox(
  SliceView(
    View(
      StorageBox(
        ComputedBuffer(name='buf138', layout=FixedLayout('cuda', torch.float32, size=[2, 232, 2, 7, 7], stride=[22736, 98, 49, 7, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf137, i4 + 7 * i3 + 49 * i1 + 11368 * i2 + 22736 * i0),
          ranges=[2, 232, 2, 7, 7],
          origins={clone_13}
        ))
      ),
      size=(2, 464, 7, 7),
      reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 232), ModularIndexing(i1, 1, 2), i2, i3],
      origins={view_27}
    ),
    size=[2, 232, 7, 7],
    reindex=lambda i0, i1, i2, i3: [i0, i1 + 232, i2, i3],
    origins={split_11}
  )
), TensorBox(StorageBox(
  Convolution(
    name=buf139,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]),
    inputs=[ReinterpretView(
      View(
        StorageBox(
          ComputedBuffer(name='buf138', layout=FixedLayout('cuda', torch.float32, size=[2, 232, 2, 7, 7], stride=[22736, 98, 49, 7, 1]), data=Pointwise(
            'cuda',
            torch.float32,
            load(buf137, i4 + 7 * i3 + 49 * i1 + 11368 * i2 + 22736 * i0),
            ranges=[2, 232, 2, 7, 7],
            origins={clone_13}
          ))
        ),
        size=(2, 464, 7, 7),
        reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 232), ModularIndexing(i1, 1, 2), i2, i3],
        origins={view_27}
      ),
      FixedLayout('cuda', torch.float32, size=[2, 232, 7, 7], stride=[22736, 49, 7, 1], offset=11368),
      origins={convolution_49}
    ), InputBuffer(name='primals_148', layout=FixedLayout('cuda', torch.float32, size=[232, 232, 1, 1], stride=[232, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_49}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf140', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf139, i3 + 7 * i2 + 49 * i1 + 11368 * i0) - load(primals_318, i1) * reciprocal(sqrt(load(primals_319, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_149, i1) + load(primals_150, i1)),
    ranges=torch.Size([2, 232, 7, 7]),
    origins={relu_32}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf141,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]),
    inputs=[ComputedBuffer(name='buf140', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      relu(load(buf139, i3 + 7 * i2 + 49 * i1 + 11368 * i0) - load(primals_318, i1) * reciprocal(sqrt(load(primals_319, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_149, i1) + load(primals_150, i1)),
      ranges=torch.Size([2, 232, 7, 7]),
      origins={relu_32}
    )), InputBuffer(name='primals_151', layout=FixedLayout('cuda', torch.float32, size=[232, 1, 3, 3], stride=[9, 9, 3, 1]))],
    constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 232),
    kwargs={},
    output_view=None,
    origins={convolution_50}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf142', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf141, i3 + 7 * i2 + 49 * i1 + 11368 * i0) - load(primals_321, i1) * reciprocal(sqrt(load(primals_322, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_152, i1) + load(primals_153, i1),
    ranges=torch.Size([2, 232, 7, 7]),
    origins={add_101}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf143,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]),
    inputs=[ComputedBuffer(name='buf142', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      load(buf141, i3 + 7 * i2 + 49 * i1 + 11368 * i0) - load(primals_321, i1) * reciprocal(sqrt(load(primals_322, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_152, i1) + load(primals_153, i1),
      ranges=torch.Size([2, 232, 7, 7]),
      origins={add_101}
    )), InputBuffer(name='primals_154', layout=FixedLayout('cuda', torch.float32, size=[232, 232, 1, 1], stride=[232, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_51}
  )
)), TensorBox(
  SliceView(
    View(
      StorageBox(
        ComputedBuffer(name='buf147', layout=FixedLayout('cuda', torch.float32, size=[2, 232, 2, 7, 7], stride=[22736, 98, 49, 7, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf146, i4 + 7 * i3 + 49 * i1 + 11368 * i2 + 22736 * i0),
          ranges=[2, 232, 2, 7, 7],
          origins={clone_14}
        ))
      ),
      size=(2, 464, 7, 7),
      reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 232), ModularIndexing(i1, 1, 2), i2, i3],
      origins={view_29}
    ),
    size=[2, 232, 7, 7],
    reindex=lambda i0, i1, i2, i3: [i0, i1 + 232, i2, i3],
    origins={split_12}
  )
), TensorBox(StorageBox(
  Convolution(
    name=buf148,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]),
    inputs=[ReinterpretView(
      View(
        StorageBox(
          ComputedBuffer(name='buf147', layout=FixedLayout('cuda', torch.float32, size=[2, 232, 2, 7, 7], stride=[22736, 98, 49, 7, 1]), data=Pointwise(
            'cuda',
            torch.float32,
            load(buf146, i4 + 7 * i3 + 49 * i1 + 11368 * i2 + 22736 * i0),
            ranges=[2, 232, 2, 7, 7],
            origins={clone_14}
          ))
        ),
        size=(2, 464, 7, 7),
        reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 232), ModularIndexing(i1, 1, 2), i2, i3],
        origins={view_29}
      ),
      FixedLayout('cuda', torch.float32, size=[2, 232, 7, 7], stride=[22736, 49, 7, 1], offset=11368),
      origins={convolution_52}
    ), InputBuffer(name='primals_157', layout=FixedLayout('cuda', torch.float32, size=[232, 232, 1, 1], stride=[232, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_52}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf149', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf148, i3 + 7 * i2 + 49 * i1 + 11368 * i0) - load(primals_327, i1) * reciprocal(sqrt(load(primals_328, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_158, i1) + load(primals_159, i1)),
    ranges=torch.Size([2, 232, 7, 7]),
    origins={relu_34}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf150,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]),
    inputs=[ComputedBuffer(name='buf149', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      relu(load(buf148, i3 + 7 * i2 + 49 * i1 + 11368 * i0) - load(primals_327, i1) * reciprocal(sqrt(load(primals_328, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_158, i1) + load(primals_159, i1)),
      ranges=torch.Size([2, 232, 7, 7]),
      origins={relu_34}
    )), InputBuffer(name='primals_160', layout=FixedLayout('cuda', torch.float32, size=[232, 1, 3, 3], stride=[9, 9, 3, 1]))],
    constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 232),
    kwargs={},
    output_view=None,
    origins={convolution_53}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf151', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf150, i3 + 7 * i2 + 49 * i1 + 11368 * i0) - load(primals_330, i1) * reciprocal(sqrt(load(primals_331, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_161, i1) + load(primals_162, i1),
    ranges=torch.Size([2, 232, 7, 7]),
    origins={add_107}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf152,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]),
    inputs=[ComputedBuffer(name='buf151', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      load(buf150, i3 + 7 * i2 + 49 * i1 + 11368 * i0) - load(primals_330, i1) * reciprocal(sqrt(load(primals_331, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_161, i1) + load(primals_162, i1),
      ranges=torch.Size([2, 232, 7, 7]),
      origins={add_107}
    )), InputBuffer(name='primals_163', layout=FixedLayout('cuda', torch.float32, size=[232, 232, 1, 1], stride=[232, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_54}
  )
)), TensorBox(
  View(
    StorageBox(
      ComputedBuffer(name='buf156', layout=FixedLayout('cuda', torch.float32, size=[2, 232, 2, 7, 7], stride=[22736, 98, 49, 7, 1]), data=Pointwise(
        'cuda',
        torch.float32,
        load(buf155, i4 + 7 * i3 + 49 * i1 + 11368 * i2 + 22736 * i0),
        ranges=[2, 232, 2, 7, 7],
        origins={clone_15}
      ))
    ),
    size=(2, 464, 7, 7),
    reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 232), ModularIndexing(i1, 1, 2), i2, i3],
    origins={view_31}
  )
), TensorBox(StorageBox(
  Convolution(
    name=buf157,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 1024, 7, 7]), stride=[50176, 49, 7, 1]),
    inputs=[ReinterpretView(
      StorageBox(
        ComputedBuffer(name='buf156', layout=FixedLayout('cuda', torch.float32, size=[2, 232, 2, 7, 7], stride=[22736, 98, 49, 7, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf155, i4 + 7 * i3 + 49 * i1 + 11368 * i2 + 22736 * i0),
          ranges=[2, 232, 2, 7, 7],
          origins={clone_15}
        ))
      ),
      FixedLayout('cuda', torch.float32, size=(2, 464, 7, 7), stride=[22736, 49, 7, 1]),
      origins={convolution_55}
    ), InputBuffer(name='primals_166', layout=FixedLayout('cuda', torch.float32, size=[1024, 464, 1, 1], stride=[464, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_55}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf160', layout=FlexibleLayout('cuda', torch.float32, size=[2, 1024], stride=[1024, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf159, i1 + 1024 * i0) / index_expr(49, torch.float32),
    ranges=[2, 1024],
    origins={mean}
  ))
)), TensorBox(
  ReinterpretView(
    StorageBox(
      InputBuffer(name='primals_169', layout=FixedLayout('cuda', torch.float32, size=[1000, 1024], stride=[1024, 1]))
    ),
    FixedLayout('cuda', torch.float32, size=[1000, 1024], stride=[1024, 1]),
    origins={permute_17}
  )
), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf158, i3 + 7 * i2 + 49 * i1 + 50176 * i0) <= constant(0, torch.float32),
    ranges=torch.Size([2, 1024, 7, 7]),
    origins={le}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf153, i3 + 7 * i2 + 49 * i1 + 22736 * i0) <= constant(0, torch.float32),
    ranges=[2, 232, 7, 7],
    origins={le_1}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf144, i3 + 7 * i2 + 49 * i1 + 22736 * i0) <= constant(0, torch.float32),
    ranges=[2, 232, 7, 7],
    origins={le_3}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf135, i3 + 7 * i2 + 49 * i1 + 22736 * i0) <= constant(0, torch.float32),
    ranges=[2, 232, 7, 7],
    origins={le_5}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf127, i3 + 7 * i2 + 49 * i1 + 22736 * i0) <= constant(0, torch.float32),
    ranges=[2, 232, 7, 7],
    origins={le_7}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf121, i3 + 7 * i2 + 49 * i1 + 22736 * i0) <= constant(0, torch.float32),
    ranges=[2, 232, 7, 7],
    origins={le_9}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf114, i3 + 14 * i2 + 196 * i1 + 45472 * i0) <= constant(0, torch.float32),
    ranges=[2, 116, 14, 14],
    origins={le_10}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf105, i3 + 14 * i2 + 196 * i1 + 45472 * i0) <= constant(0, torch.float32),
    ranges=[2, 116, 14, 14],
    origins={le_12}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf96, i3 + 14 * i2 + 196 * i1 + 45472 * i0) <= constant(0, torch.float32),
    ranges=[2, 116, 14, 14],
    origins={le_14}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf87, i3 + 14 * i2 + 196 * i1 + 45472 * i0) <= constant(0, torch.float32),
    ranges=[2, 116, 14, 14],
    origins={le_16}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf78, i3 + 14 * i2 + 196 * i1 + 45472 * i0) <= constant(0, torch.float32),
    ranges=[2, 116, 14, 14],
    origins={le_18}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf69, i3 + 14 * i2 + 196 * i1 + 45472 * i0) <= constant(0, torch.float32),
    ranges=[2, 116, 14, 14],
    origins={le_20}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf60, i3 + 14 * i2 + 196 * i1 + 45472 * i0) <= constant(0, torch.float32),
    ranges=[2, 116, 14, 14],
    origins={le_22}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf52, i3 + 14 * i2 + 196 * i1 + 45472 * i0) <= constant(0, torch.float32),
    ranges=[2, 116, 14, 14],
    origins={le_24}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf46, i3 + 14 * i2 + 196 * i1 + 45472 * i0) <= constant(0, torch.float32),
    ranges=[2, 116, 14, 14],
    origins={le_26}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf39, i3 + 28 * i2 + 784 * i1 + 90944 * i0) <= constant(0, torch.float32),
    ranges=[2, 58, 28, 28],
    origins={le_27}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf30, i3 + 28 * i2 + 784 * i1 + 90944 * i0) <= constant(0, torch.float32),
    ranges=[2, 58, 28, 28],
    origins={le_29}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf21, i3 + 28 * i2 + 784 * i1 + 90944 * i0) <= constant(0, torch.float32),
    ranges=[2, 58, 28, 28],
    origins={le_31}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf13, i3 + 28 * i2 + 784 * i1 + 90944 * i0) <= constant(0, torch.float32),
    ranges=[2, 58, 28, 28],
    origins={le_33}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf7, i3 + 28 * i2 + 784 * i1 + 90944 * i0) <= constant(0, torch.float32),
    ranges=[2, 58, 28, 28],
    origins={le_35}
  )
)), s0, 28, 28, 14, 14, 7, 7]

While executing return [addmm, primals_1, primals_2, primals_4, primals_5, primals_7, primals_8, primals_10, primals_11, primals_13, primals_14, primals_16, primals_17, primals_19, primals_20, primals_22, primals_23, primals_25, primals_26, primals_28, primals_29, primals_31, primals_32, primals_34, primals_35, primals_37, primals_38, primals_40, primals_41, primals_43, primals_44, primals_46, primals_47, primals_49, primals_50, primals_52, primals_53, primals_55, primals_56, primals_58, primals_59, primals_61, primals_62, primals_64, primals_65, primals_67, primals_68, primals_70, primals_71, primals_73, primals_74, primals_76, primals_77, primals_79, primals_80, primals_82, primals_83, primals_85, primals_86, primals_88, primals_89, primals_91, primals_92, primals_94, primals_95, primals_97, primals_98, primals_100, primals_101, primals_103, primals_104, primals_106, primals_107, primals_109, primals_110, primals_112, primals_113, primals_115, primals_116, primals_118, primals_119, primals_121, primals_122, primals_124, primals_125, primals_127, primals_128, primals_130, primals_131, primals_133, primals_134, primals_136, primals_137, primals_139, primals_140, primals_142, primals_143, primals_145, primals_146, primals_148, primals_149, primals_151, primals_152, primals_154, primals_155, primals_157, primals_158, primals_160, primals_161, primals_163, primals_164, primals_166, primals_167, primals_171, primals_172, primals_174, primals_175, primals_177, primals_178, primals_180, primals_181, primals_183, primals_184, primals_186, primals_187, primals_189, primals_190, primals_192, primals_193, primals_195, primals_196, primals_198, primals_199, primals_201, primals_202, primals_204, primals_205, primals_207, primals_208, primals_210, primals_211, primals_213, primals_214, primals_216, primals_217, primals_219, primals_220, primals_222, primals_223, primals_225, primals_226, primals_228, primals_229, primals_231, primals_232, primals_234, primals_235, primals_237, 
primals_238, primals_240, primals_241, primals_243, primals_244, primals_246, primals_247, primals_249, primals_250, primals_252, primals_253, primals_255, primals_256, primals_258, primals_259, primals_261, primals_262, primals_264, primals_265, primals_267, primals_268, primals_270, primals_271, primals_273, primals_274, primals_276, primals_277, primals_279, primals_280, primals_282, primals_283, primals_285, primals_286, primals_288, primals_289, primals_291, primals_292, primals_294, primals_295, primals_297, primals_298, primals_300, primals_301, primals_303, primals_304, primals_306, primals_307, primals_309, primals_310, primals_312, primals_313, primals_315, primals_316, primals_318, primals_319, primals_321, primals_322, primals_324, primals_325, primals_327, primals_328, primals_330, primals_331, primals_333, primals_334, primals_336, primals_337, primals_339, convolution, relu, getitem, getitem_1, convolution_1, add_3, convolution_2, convolution_3, relu_2, convolution_4, add_9, convolution_5, getitem_3, convolution_6, relu_4, convolution_7, add_15, convolution_8, getitem_5, convolution_9, relu_6, convolution_10, add_21, convolution_11, getitem_7, convolution_12, relu_8, convolution_13, add_27, convolution_14, view_7, convolution_15, add_31, convolution_16, convolution_17, relu_11, convolution_18, add_37, convolution_19, getitem_9, convolution_20, relu_13, convolution_21, add_43, convolution_22, getitem_11, convolution_23, relu_15, convolution_24, add_49, convolution_25, getitem_13, convolution_26, relu_17, convolution_27, add_55, convolution_28, getitem_15, convolution_29, relu_19, convolution_30, add_61, convolution_31, getitem_17, convolution_32, relu_21, convolution_33, add_67, convolution_34, getitem_19, convolution_35, relu_23, convolution_36, add_73, convolution_37, getitem_21, convolution_38, relu_25, convolution_39, add_79, convolution_40, view_23, convolution_41, add_83, convolution_42, convolution_43, relu_28, convolution_44, add_89, 
convolution_45, getitem_23, convolution_46, relu_30, convolution_47, add_95, convolution_48, getitem_25, convolution_49, relu_32, convolution_50, add_101, convolution_51, getitem_27, convolution_52, relu_34, convolution_53, add_107, convolution_54, view_31, convolution_55, mean, permute_17, le, le_1, le_3, le_5, le_7, le_9, le_10, le_12, le_14, le_16, le_18, le_20, le_22, le_24, le_26, le_27, le_29, le_31, le_33, le_35, sym_size, sym_size_1, sym_size_2, sym_size_3, sym_size_4, sym_size_5, sym_size_6]
Original traceback:
None
Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1122, in check_accuracy
    new_result = optimized_model_iter_fn(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1020, in run_n_iterations
    self.model_iter_fn(mod, inputs, collect_outputs=False)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 332, in forward_and_backward_pass
    cloned_inputs = clone_inputs(inputs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 335, in <graph break in forward_and_backward_pass>
    pred = mod(*cloned_inputs)
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/torchvision/torchvision/models/shufflenetv2.py", line 165, in forward
    def forward(self, x: Tensor) -> Tensor:
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 954, in forward
    return compiled_f(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 940, in new_func
    compiled_fn = create_aot_dispatcher_function(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 660, in create_aot_dispatcher_function
    aot_dispatch_autograd(flat_fn, fake_flat_tensor_args, aot_config)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 516, in aot_dispatch_autograd
    compiled_fw_func = aot_config.fw_compiler(fw_module, deduped_flat_args)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 87, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 351, in fw_compiler
    return compile_fx_inner(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/debug_utils.py", line 444, in debug_wrapper
    compiled_fn = compiler_fn(gm, example_inputs, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/debug.py", line 177, in inner
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/env/lib/python3.9/contextlib.py", line 79, in inner
    return func(*args, **kwds)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 121, in compile_fx_inner
    graph.run(*example_inputs)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 87, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 129, in run
    return super().run(*args)
  File "/scratch/ezyang/work/pytorch/torch/fx/interpreter.py", line 130, in run
    self.env[node] = self.run_node(node)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 299, in run_node
    result = super().run_node(n)
  File "/scratch/ezyang/work/pytorch/torch/fx/interpreter.py", line 171, in run_node
    return getattr(self, n.op)(n.target, args, kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 267, in output
    assert all(
AssertionError: [TensorBox(StorageBox(
  MatrixMultiplyAdd(
    name=buf161,
    layout=FlexibleLayout('cuda', torch.float32, size=[2, 1000], stride=[1000, 1]),
    inputs=[InputBuffer(name='primals_170', layout=FixedLayout('cuda', torch.float32, size=[1000], stride=[1])), ComputedBuffer(name='buf160', layout=FlexibleLayout('cuda', torch.float32, size=[2, 1024], stride=[1024, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      load(buf159, i1 + 1024 * i0) / index_expr(49, torch.float32),
      ranges=[2, 1024],
      origins={mean}
    )), ReinterpretView(
      StorageBox(
        InputBuffer(name='primals_169', layout=FixedLayout('cuda', torch.float32, size=[1000, 1024], stride=[1024, 1]))
      ),
      FixedLayout('cuda', torch.float32, size=[1024, 1000], stride=[1, 1024]),
      origins={permute_16}
    )],
    constant_args=(),
    kwargs={'beta': 1, 'alpha': 1},
    output_view=None,
    origins={addmm}
  )
)), TensorBox(StorageBox(
  InputBuffer(name='primals_1', layout=FixedLayout('cuda', torch.float32, size=[24, 3, 3, 3], stride=[27, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_2', layout=FixedLayout('cuda', torch.float32, size=[24], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_4', layout=FixedLayout('cuda', torch.float32, size=[24, 1, 3, 3], stride=[9, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_5', layout=FixedLayout('cuda', torch.float32, size=[24], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_7', layout=FixedLayout('cuda', torch.float32, size=[58, 24, 1, 1], stride=[24, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_8', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_10', layout=FixedLayout('cuda', torch.float32, size=[58, 24, 1, 1], stride=[24, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_11', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_13', layout=FixedLayout('cuda', torch.float32, size=[58, 1, 3, 3], stride=[9, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_14', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_16', layout=FixedLayout('cuda', torch.float32, size=[58, 58, 1, 1], stride=[58, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_17', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_19', layout=FixedLayout('cuda', torch.float32, size=[58, 58, 1, 1], stride=[58, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_20', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_22', layout=FixedLayout('cuda', torch.float32, size=[58, 1, 3, 3], stride=[9, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_23', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_25', layout=FixedLayout('cuda', torch.float32, size=[58, 58, 1, 1], stride=[58, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_26', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_28', layout=FixedLayout('cuda', torch.float32, size=[58, 58, 1, 1], stride=[58, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_29', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_31', layout=FixedLayout('cuda', torch.float32, size=[58, 1, 3, 3], stride=[9, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_32', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_34', layout=FixedLayout('cuda', torch.float32, size=[58, 58, 1, 1], stride=[58, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_35', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_37', layout=FixedLayout('cuda', torch.float32, size=[58, 58, 1, 1], stride=[58, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_38', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_40', layout=FixedLayout('cuda', torch.float32, size=[58, 1, 3, 3], stride=[9, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_41', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_43', layout=FixedLayout('cuda', torch.float32, size=[58, 58, 1, 1], stride=[58, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_44', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_46', layout=FixedLayout('cuda', torch.float32, size=[116, 1, 3, 3], stride=[9, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_47', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_49', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_50', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_52', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_53', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_55', layout=FixedLayout('cuda', torch.float32, size=[116, 1, 3, 3], stride=[9, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_56', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_58', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_59', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_61', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_62', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_64', layout=FixedLayout('cuda', torch.float32, size=[116, 1, 3, 3], stride=[9, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_65', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_67', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_68', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_70', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_71', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_73', layout=FixedLayout('cuda', torch.float32, size=[116, 1, 3, 3], stride=[9, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_74', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_76', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_77', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_79', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_80', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_82', layout=FixedLayout('cuda', torch.float32, size=[116, 1, 3, 3], stride=[9, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_83', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_85', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_86', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_88', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_89', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_91', layout=FixedLayout('cuda', torch.float32, size=[116, 1, 3, 3], stride=[9, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_92', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_94', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_95', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_97', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_98', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_100', layout=FixedLayout('cuda', torch.float32, size=[116, 1, 3, 3], stride=[9, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_101', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_103', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_104', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_106', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_107', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_109', layout=FixedLayout('cuda', torch.float32, size=[116, 1, 3, 3], stride=[9, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_110', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_112', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_113', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_115', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_116', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_118', layout=FixedLayout('cuda', torch.float32, size=[116, 1, 3, 3], stride=[9, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_119', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_121', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_122', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_124', layout=FixedLayout('cuda', torch.float32, size=[232, 1, 3, 3], stride=[9, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_125', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_127', layout=FixedLayout('cuda', torch.float32, size=[232, 232, 1, 1], stride=[232, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_128', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_130', layout=FixedLayout('cuda', torch.float32, size=[232, 232, 1, 1], stride=[232, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_131', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_133', layout=FixedLayout('cuda', torch.float32, size=[232, 1, 3, 3], stride=[9, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_134', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_136', layout=FixedLayout('cuda', torch.float32, size=[232, 232, 1, 1], stride=[232, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_137', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_139', layout=FixedLayout('cuda', torch.float32, size=[232, 232, 1, 1], stride=[232, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_140', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_142', layout=FixedLayout('cuda', torch.float32, size=[232, 1, 3, 3], stride=[9, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_143', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_145', layout=FixedLayout('cuda', torch.float32, size=[232, 232, 1, 1], stride=[232, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_146', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_148', layout=FixedLayout('cuda', torch.float32, size=[232, 232, 1, 1], stride=[232, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_149', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_151', layout=FixedLayout('cuda', torch.float32, size=[232, 1, 3, 3], stride=[9, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_152', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_154', layout=FixedLayout('cuda', torch.float32, size=[232, 232, 1, 1], stride=[232, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_155', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_157', layout=FixedLayout('cuda', torch.float32, size=[232, 232, 1, 1], stride=[232, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_158', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_160', layout=FixedLayout('cuda', torch.float32, size=[232, 1, 3, 3], stride=[9, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_161', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_163', layout=FixedLayout('cuda', torch.float32, size=[232, 232, 1, 1], stride=[232, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_164', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_166', layout=FixedLayout('cuda', torch.float32, size=[1024, 464, 1, 1], stride=[464, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_167', layout=FixedLayout('cuda', torch.float32, size=[1024], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_171', layout=FixedLayout('cuda', torch.float32, size=[24], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_172', layout=FixedLayout('cuda', torch.float32, size=[24], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_174', layout=FixedLayout('cuda', torch.float32, size=[24], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_175', layout=FixedLayout('cuda', torch.float32, size=[24], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_177', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_178', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_180', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_181', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_183', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_184', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_186', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_187', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_189', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_190', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_192', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_193', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_195', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_196', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_198', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_199', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_201', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_202', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_204', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_205', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_207', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_208', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_210', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_211', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_213', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_214', layout=FixedLayout('cuda', torch.float32, size=[58], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_216', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_217', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_219', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_220', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_222', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_223', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_225', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_226', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_228', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_229', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_231', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_232', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_234', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_235', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_237', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_238', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_240', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_241', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_243', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_244', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_246', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_247', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_249', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_250', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_252', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_253', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_255', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_256', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_258', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_259', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_261', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_262', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_264', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_265', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_267', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_268', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_270', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_271', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_273', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_274', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_276', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_277', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_279', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_280', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_282', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_283', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_285', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_286', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_288', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_289', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_291', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_292', layout=FixedLayout('cuda', torch.float32, size=[116], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_294', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_295', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_297', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_298', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_300', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_301', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_303', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_304', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_306', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_307', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_309', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_310', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_312', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_313', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_315', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_316', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_318', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_319', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_321', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_322', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_324', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_325', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_327', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_328', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_330', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_331', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_333', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_334', layout=FixedLayout('cuda', torch.float32, size=[232], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_336', layout=FixedLayout('cuda', torch.float32, size=[1024], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_337', layout=FixedLayout('cuda', torch.float32, size=[1024], stride=[1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_339', layout=FixedLayout('cuda', torch.float32, size=[s0, 3, s2, s2], stride=[3*s2**2, s2**2, s2, 1]))
)), TensorBox(StorageBox(
  Convolution(
    name=buf0,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 24, 112, 112]), stride=[301056, 12544, 112, 1]),
    inputs=[InputBuffer(name='primals_339', layout=FixedLayout('cuda', torch.float32, size=[s0, 3, s2, s2], stride=[3*s2**2, s2**2, s2, 1])), InputBuffer(name='primals_1', layout=FixedLayout('cuda', torch.float32, size=[24, 3, 3, 3], stride=[27, 9, 3, 1]))],
    constant_args=(None, (2, 2), (1, 1), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf1', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 24, 112, 112]), stride=[301056, 12544, 112, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf0, i3 + 112 * i2 + 12544 * i1 + 301056 * i0) - load(primals_171, i1) * reciprocal(sqrt(load(primals_172, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_2, i1) + load(primals_3, i1)),
    ranges=torch.Size([2, 24, 112, 112]),
    origins={relu}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf2', layout=FixedLayout('cuda', torch.float32, size=[2, 24, 56, 56], stride=[75264, 3136, 56, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    maximum(masked(index_expr(1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 113 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 112 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 111 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 1 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, 
torch.int64) < index_expr(112, torch.int64), load(buf1, -1 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -111 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -112 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -113 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf))))))))),
    ranges=[2, 24, 56, 56],
    origins={max_pool2d_with_indices}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf3', layout=FlexibleLayout('cuda', torch.int64, size=[2, 24, 56, 56], stride=[75264, 3136, 56, 1]), data=Pointwise(
    'cuda',
    torch.int64,
    where(masked(index_expr(1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 113 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf) > maximum(masked(index_expr(1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 112 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 111 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 1 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, 
torch.int64) < index_expr(112, torch.int64), load(buf1, -1 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -111 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -112 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -113 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf)))))))), index_expr(113 + 2 * i3 + 224 * i2, torch.int64), where(masked(index_expr(1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 112 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf) > maximum(masked(index_expr(1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 111 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(2 * i2, 
torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 1 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -1 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -111 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -112 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, 
torch.int64), load(buf1, -113 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf))))))), index_expr(112 + 2 * i3 + 224 * i2, torch.int64), where(masked(index_expr(1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 111 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf) > maximum(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 1 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -1 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -111 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 
+ 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -112 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -113 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf)))))), index_expr(111 + 2 * i3 + 224 * i2, torch.int64), where(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 1 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf) > maximum(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -1 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), 
load(buf1, -111 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -112 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -113 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf))))), index_expr(1 + 2 * i3 + 224 * i2, torch.int64), where(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf) > maximum(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -1 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -111 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, 
torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -112 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -113 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf)))), index_expr(2 * i3 + 224 * i2, torch.int64), where(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -1 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf) > maximum(masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -111 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -112 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), 
load(buf1, -113 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf))), index_expr(-1 + 2 * i3 + 224 * i2, torch.int64), where(masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -111 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf) > maximum(masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -112 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -113 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf)), index_expr(-111 + 2 * i3 + 224 * i2, torch.int64), where(masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -112 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf) > masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -113 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), index_expr(-112 + 2 * i3 + 224 * i2, 
torch.int64), index_expr(-113 + 2 * i3 + 224 * i2, torch.int64))))))))),
    ranges=[2, 24, 56, 56],
    origins={max_pool2d_with_indices}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf4,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 24, 28, 28]), stride=[18816, 784, 28, 1]),
    inputs=[ComputedBuffer(name='buf2', layout=FixedLayout('cuda', torch.float32, size=[2, 24, 56, 56], stride=[75264, 3136, 56, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      maximum(masked(index_expr(1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 113 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 112 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 111 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 1 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, 
torch.int64) < index_expr(112, torch.int64), load(buf1, -1 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -111 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -112 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -113 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf))))))))),
      ranges=[2, 24, 56, 56],
      origins={max_pool2d_with_indices}
    )), InputBuffer(name='primals_4', layout=FixedLayout('cuda', torch.float32, size=[24, 1, 3, 3], stride=[9, 9, 3, 1]))],
    constant_args=(None, (2, 2), (1, 1), (1, 1), False, (0, 0), 24),
    kwargs={},
    output_view=None,
    origins={convolution_1}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf5', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 24, 28, 28]), stride=[18816, 784, 28, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf4, i3 + 28 * i2 + 784 * i1 + 18816 * i0) - load(primals_174, i1) * reciprocal(sqrt(load(primals_175, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_5, i1) + load(primals_6, i1),
    ranges=torch.Size([2, 24, 28, 28]),
    origins={add_3}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf6,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]),
    inputs=[ComputedBuffer(name='buf5', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 24, 28, 28]), stride=[18816, 784, 28, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      load(buf4, i3 + 28 * i2 + 784 * i1 + 18816 * i0) - load(primals_174, i1) * reciprocal(sqrt(load(primals_175, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_5, i1) + load(primals_6, i1),
      ranges=torch.Size([2, 24, 28, 28]),
      origins={add_3}
    )), InputBuffer(name='primals_7', layout=FixedLayout('cuda', torch.float32, size=[58, 24, 1, 1], stride=[24, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_2}
  )
)), TensorBox(StorageBox(
  Convolution(
    name=buf8,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 58, 56, 56]), stride=[181888, 3136, 56, 1]),
    inputs=[ComputedBuffer(name='buf2', layout=FixedLayout('cuda', torch.float32, size=[2, 24, 56, 56], stride=[75264, 3136, 56, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      maximum(masked(index_expr(1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 113 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 112 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 111 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 1 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, 
torch.int64) < index_expr(112, torch.int64), load(buf1, -1 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -111 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), maximum(masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -112 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf), masked(index_expr(-1 + 2 * i2, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i2, torch.int64) < index_expr(112, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) >= index_expr(0, torch.int64) & index_expr(-1 + 2 * i3, torch.int64) < index_expr(112, torch.int64), load(buf1, -113 + 2 * i3 + 224 * i2 + 12544 * i1 + 301056 * i0), -inf))))))))),
      ranges=[2, 24, 56, 56],
      origins={max_pool2d_with_indices}
    )), InputBuffer(name='primals_10', layout=FixedLayout('cuda', torch.float32, size=[58, 24, 1, 1], stride=[24, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_3}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf9', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 58, 56, 56]), stride=[181888, 3136, 56, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf8, i3 + 56 * i2 + 3136 * i1 + 181888 * i0) - load(primals_180, i1) * reciprocal(sqrt(load(primals_181, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_11, i1) + load(primals_12, i1)),
    ranges=torch.Size([2, 58, 56, 56]),
    origins={relu_2}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf10,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]),
    inputs=[ComputedBuffer(name='buf9', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 58, 56, 56]), stride=[181888, 3136, 56, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      relu(load(buf8, i3 + 56 * i2 + 3136 * i1 + 181888 * i0) - load(primals_180, i1) * reciprocal(sqrt(load(primals_181, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_11, i1) + load(primals_12, i1)),
      ranges=torch.Size([2, 58, 56, 56]),
      origins={relu_2}
    )), InputBuffer(name='primals_13', layout=FixedLayout('cuda', torch.float32, size=[58, 1, 3, 3], stride=[9, 9, 3, 1]))],
    constant_args=(None, (2, 2), (1, 1), (1, 1), False, (0, 0), 58),
    kwargs={},
    output_view=None,
    origins={convolution_4}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf11', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf10, i3 + 28 * i2 + 784 * i1 + 45472 * i0) - load(primals_183, i1) * reciprocal(sqrt(load(primals_184, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_14, i1) + load(primals_15, i1),
    ranges=torch.Size([2, 58, 28, 28]),
    origins={add_9}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf12,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]),
    inputs=[ComputedBuffer(name='buf11', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      load(buf10, i3 + 28 * i2 + 784 * i1 + 45472 * i0) - load(primals_183, i1) * reciprocal(sqrt(load(primals_184, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_14, i1) + load(primals_15, i1),
      ranges=torch.Size([2, 58, 28, 28]),
      origins={add_9}
    )), InputBuffer(name='primals_16', layout=FixedLayout('cuda', torch.float32, size=[58, 58, 1, 1], stride=[58, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_5}
  )
)), TensorBox(
  SliceView(
    View(
      StorageBox(
        ComputedBuffer(name='buf15', layout=FixedLayout('cuda', torch.float32, size=[2, 58, 2, 28, 28], stride=[90944, 1568, 784, 28, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf14, i4 + 28 * i3 + 784 * i1 + 45472 * i2 + 90944 * i0),
          ranges=[2, 58, 2, 28, 28],
          origins={clone}
        ))
      ),
      size=(2, 116, 28, 28),
      reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 58), ModularIndexing(i1, 1, 2), i2, i3],
      origins={view_1}
    ),
    size=[2, 58, 28, 28],
    reindex=lambda i0, i1, i2, i3: [i0, i1 + 58, i2, i3],
    origins={split}
  )
), TensorBox(StorageBox(
  Convolution(
    name=buf16,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]),
    inputs=[ReinterpretView(
      View(
        StorageBox(
          ComputedBuffer(name='buf15', layout=FixedLayout('cuda', torch.float32, size=[2, 58, 2, 28, 28], stride=[90944, 1568, 784, 28, 1]), data=Pointwise(
            'cuda',
            torch.float32,
            load(buf14, i4 + 28 * i3 + 784 * i1 + 45472 * i2 + 90944 * i0),
            ranges=[2, 58, 2, 28, 28],
            origins={clone}
          ))
        ),
        size=(2, 116, 28, 28),
        reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 58), ModularIndexing(i1, 1, 2), i2, i3],
        origins={view_1}
      ),
      FixedLayout('cuda', torch.float32, size=[2, 58, 28, 28], stride=[90944, 784, 28, 1], offset=45472),
      origins={convolution_6}
    ), InputBuffer(name='primals_19', layout=FixedLayout('cuda', torch.float32, size=[58, 58, 1, 1], stride=[58, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_6}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf17', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf16, i3 + 28 * i2 + 784 * i1 + 45472 * i0) - load(primals_189, i1) * reciprocal(sqrt(load(primals_190, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_20, i1) + load(primals_21, i1)),
    ranges=torch.Size([2, 58, 28, 28]),
    origins={relu_4}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf18,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]),
    inputs=[ComputedBuffer(name='buf17', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      relu(load(buf16, i3 + 28 * i2 + 784 * i1 + 45472 * i0) - load(primals_189, i1) * reciprocal(sqrt(load(primals_190, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_20, i1) + load(primals_21, i1)),
      ranges=torch.Size([2, 58, 28, 28]),
      origins={relu_4}
    )), InputBuffer(name='primals_22', layout=FixedLayout('cuda', torch.float32, size=[58, 1, 3, 3], stride=[9, 9, 3, 1]))],
    constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 58),
    kwargs={},
    output_view=None,
    origins={convolution_7}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf19', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf18, i3 + 28 * i2 + 784 * i1 + 45472 * i0) - load(primals_192, i1) * reciprocal(sqrt(load(primals_193, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_23, i1) + load(primals_24, i1),
    ranges=torch.Size([2, 58, 28, 28]),
    origins={add_15}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf20,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]),
    inputs=[ComputedBuffer(name='buf19', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      load(buf18, i3 + 28 * i2 + 784 * i1 + 45472 * i0) - load(primals_192, i1) * reciprocal(sqrt(load(primals_193, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_23, i1) + load(primals_24, i1),
      ranges=torch.Size([2, 58, 28, 28]),
      origins={add_15}
    )), InputBuffer(name='primals_25', layout=FixedLayout('cuda', torch.float32, size=[58, 58, 1, 1], stride=[58, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_8}
  )
)), TensorBox(
  SliceView(
    View(
      StorageBox(
        ComputedBuffer(name='buf24', layout=FixedLayout('cuda', torch.float32, size=[2, 58, 2, 28, 28], stride=[90944, 1568, 784, 28, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf23, i4 + 28 * i3 + 784 * i1 + 45472 * i2 + 90944 * i0),
          ranges=[2, 58, 2, 28, 28],
          origins={clone_1}
        ))
      ),
      size=(2, 116, 28, 28),
      reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 58), ModularIndexing(i1, 1, 2), i2, i3],
      origins={view_3}
    ),
    size=[2, 58, 28, 28],
    reindex=lambda i0, i1, i2, i3: [i0, i1 + 58, i2, i3],
    origins={split_1}
  )
), TensorBox(StorageBox(
  Convolution(
    name=buf25,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]),
    inputs=[ReinterpretView(
      View(
        StorageBox(
          ComputedBuffer(name='buf24', layout=FixedLayout('cuda', torch.float32, size=[2, 58, 2, 28, 28], stride=[90944, 1568, 784, 28, 1]), data=Pointwise(
            'cuda',
            torch.float32,
            load(buf23, i4 + 28 * i3 + 784 * i1 + 45472 * i2 + 90944 * i0),
            ranges=[2, 58, 2, 28, 28],
            origins={clone_1}
          ))
        ),
        size=(2, 116, 28, 28),
        reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 58), ModularIndexing(i1, 1, 2), i2, i3],
        origins={view_3}
      ),
      FixedLayout('cuda', torch.float32, size=[2, 58, 28, 28], stride=[90944, 784, 28, 1], offset=45472),
      origins={convolution_9}
    ), InputBuffer(name='primals_28', layout=FixedLayout('cuda', torch.float32, size=[58, 58, 1, 1], stride=[58, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_9}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf26', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf25, i3 + 28 * i2 + 784 * i1 + 45472 * i0) - load(primals_198, i1) * reciprocal(sqrt(load(primals_199, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_29, i1) + load(primals_30, i1)),
    ranges=torch.Size([2, 58, 28, 28]),
    origins={relu_6}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf27,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]),
    inputs=[ComputedBuffer(name='buf26', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      relu(load(buf25, i3 + 28 * i2 + 784 * i1 + 45472 * i0) - load(primals_198, i1) * reciprocal(sqrt(load(primals_199, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_29, i1) + load(primals_30, i1)),
      ranges=torch.Size([2, 58, 28, 28]),
      origins={relu_6}
    )), InputBuffer(name='primals_31', layout=FixedLayout('cuda', torch.float32, size=[58, 1, 3, 3], stride=[9, 9, 3, 1]))],
    constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 58),
    kwargs={},
    output_view=None,
    origins={convolution_10}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf28', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf27, i3 + 28 * i2 + 784 * i1 + 45472 * i0) - load(primals_201, i1) * reciprocal(sqrt(load(primals_202, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_32, i1) + load(primals_33, i1),
    ranges=torch.Size([2, 58, 28, 28]),
    origins={add_21}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf29,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]),
    inputs=[ComputedBuffer(name='buf28', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      load(buf27, i3 + 28 * i2 + 784 * i1 + 45472 * i0) - load(primals_201, i1) * reciprocal(sqrt(load(primals_202, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_32, i1) + load(primals_33, i1),
      ranges=torch.Size([2, 58, 28, 28]),
      origins={add_21}
    )), InputBuffer(name='primals_34', layout=FixedLayout('cuda', torch.float32, size=[58, 58, 1, 1], stride=[58, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_11}
  )
)), TensorBox(
  SliceView(
    View(
      StorageBox(
        ComputedBuffer(name='buf33', layout=FixedLayout('cuda', torch.float32, size=[2, 58, 2, 28, 28], stride=[90944, 1568, 784, 28, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf32, i4 + 28 * i3 + 784 * i1 + 45472 * i2 + 90944 * i0),
          ranges=[2, 58, 2, 28, 28],
          origins={clone_2}
        ))
      ),
      size=(2, 116, 28, 28),
      reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 58), ModularIndexing(i1, 1, 2), i2, i3],
      origins={view_5}
    ),
    size=[2, 58, 28, 28],
    reindex=lambda i0, i1, i2, i3: [i0, i1 + 58, i2, i3],
    origins={split_2}
  )
), TensorBox(StorageBox(
  Convolution(
    name=buf34,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]),
    inputs=[ReinterpretView(
      View(
        StorageBox(
          ComputedBuffer(name='buf33', layout=FixedLayout('cuda', torch.float32, size=[2, 58, 2, 28, 28], stride=[90944, 1568, 784, 28, 1]), data=Pointwise(
            'cuda',
            torch.float32,
            load(buf32, i4 + 28 * i3 + 784 * i1 + 45472 * i2 + 90944 * i0),
            ranges=[2, 58, 2, 28, 28],
            origins={clone_2}
          ))
        ),
        size=(2, 116, 28, 28),
        reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 58), ModularIndexing(i1, 1, 2), i2, i3],
        origins={view_5}
      ),
      FixedLayout('cuda', torch.float32, size=[2, 58, 28, 28], stride=[90944, 784, 28, 1], offset=45472),
      origins={convolution_12}
    ), InputBuffer(name='primals_37', layout=FixedLayout('cuda', torch.float32, size=[58, 58, 1, 1], stride=[58, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_12}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf35', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf34, i3 + 28 * i2 + 784 * i1 + 45472 * i0) - load(primals_207, i1) * reciprocal(sqrt(load(primals_208, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_38, i1) + load(primals_39, i1)),
    ranges=torch.Size([2, 58, 28, 28]),
    origins={relu_8}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf36,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]),
    inputs=[ComputedBuffer(name='buf35', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      relu(load(buf34, i3 + 28 * i2 + 784 * i1 + 45472 * i0) - load(primals_207, i1) * reciprocal(sqrt(load(primals_208, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_38, i1) + load(primals_39, i1)),
      ranges=torch.Size([2, 58, 28, 28]),
      origins={relu_8}
    )), InputBuffer(name='primals_40', layout=FixedLayout('cuda', torch.float32, size=[58, 1, 3, 3], stride=[9, 9, 3, 1]))],
    constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 58),
    kwargs={},
    output_view=None,
    origins={convolution_13}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf37', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf36, i3 + 28 * i2 + 784 * i1 + 45472 * i0) - load(primals_210, i1) * reciprocal(sqrt(load(primals_211, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_41, i1) + load(primals_42, i1),
    ranges=torch.Size([2, 58, 28, 28]),
    origins={add_27}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf38,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]),
    inputs=[ComputedBuffer(name='buf37', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 58, 28, 28]), stride=[45472, 784, 28, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      load(buf36, i3 + 28 * i2 + 784 * i1 + 45472 * i0) - load(primals_210, i1) * reciprocal(sqrt(load(primals_211, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_41, i1) + load(primals_42, i1),
      ranges=torch.Size([2, 58, 28, 28]),
      origins={add_27}
    )), InputBuffer(name='primals_43', layout=FixedLayout('cuda', torch.float32, size=[58, 58, 1, 1], stride=[58, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_14}
  )
)), TensorBox(
  View(
    StorageBox(
      ComputedBuffer(name='buf42', layout=FixedLayout('cuda', torch.float32, size=[2, 58, 2, 28, 28], stride=[90944, 1568, 784, 28, 1]), data=Pointwise(
        'cuda',
        torch.float32,
        load(buf41, i4 + 28 * i3 + 784 * i1 + 45472 * i2 + 90944 * i0),
        ranges=[2, 58, 2, 28, 28],
        origins={clone_3}
      ))
    ),
    size=(2, 116, 28, 28),
    reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 58), ModularIndexing(i1, 1, 2), i2, i3],
    origins={view_7}
  )
), TensorBox(StorageBox(
  Convolution(
    name=buf43,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ReinterpretView(
      StorageBox(
        ComputedBuffer(name='buf42', layout=FixedLayout('cuda', torch.float32, size=[2, 58, 2, 28, 28], stride=[90944, 1568, 784, 28, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf41, i4 + 28 * i3 + 784 * i1 + 45472 * i2 + 90944 * i0),
          ranges=[2, 58, 2, 28, 28],
          origins={clone_3}
        ))
      ),
      FixedLayout('cuda', torch.float32, size=(2, 116, 28, 28), stride=[90944, 784, 28, 1]),
      origins={convolution_15}
    ), InputBuffer(name='primals_46', layout=FixedLayout('cuda', torch.float32, size=[116, 1, 3, 3], stride=[9, 9, 3, 1]))],
    constant_args=(None, (2, 2), (1, 1), (1, 1), False, (0, 0), 116),
    kwargs={},
    output_view=None,
    origins={convolution_15}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf44', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf43, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_216, i1) * reciprocal(sqrt(load(primals_217, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_47, i1) + load(primals_48, i1),
    ranges=torch.Size([2, 116, 14, 14]),
    origins={add_31}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf45,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ComputedBuffer(name='buf44', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      load(buf43, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_216, i1) * reciprocal(sqrt(load(primals_217, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_47, i1) + load(primals_48, i1),
      ranges=torch.Size([2, 116, 14, 14]),
      origins={add_31}
    )), InputBuffer(name='primals_49', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_16}
  )
)), TensorBox(StorageBox(
  Convolution(
    name=buf47,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 28, 28]), stride=[90944, 784, 28, 1]),
    inputs=[ReinterpretView(
      StorageBox(
        ComputedBuffer(name='buf42', layout=FixedLayout('cuda', torch.float32, size=[2, 58, 2, 28, 28], stride=[90944, 1568, 784, 28, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf41, i4 + 28 * i3 + 784 * i1 + 45472 * i2 + 90944 * i0),
          ranges=[2, 58, 2, 28, 28],
          origins={clone_3}
        ))
      ),
      FixedLayout('cuda', torch.float32, size=(2, 116, 28, 28), stride=[90944, 784, 28, 1]),
      origins={convolution_17}
    ), InputBuffer(name='primals_52', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_17}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf48', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 28, 28]), stride=[90944, 784, 28, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf47, i3 + 28 * i2 + 784 * i1 + 90944 * i0) - load(primals_222, i1) * reciprocal(sqrt(load(primals_223, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_53, i1) + load(primals_54, i1)),
    ranges=torch.Size([2, 116, 28, 28]),
    origins={relu_11}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf49,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ComputedBuffer(name='buf48', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 28, 28]), stride=[90944, 784, 28, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      relu(load(buf47, i3 + 28 * i2 + 784 * i1 + 90944 * i0) - load(primals_222, i1) * reciprocal(sqrt(load(primals_223, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_53, i1) + load(primals_54, i1)),
      ranges=torch.Size([2, 116, 28, 28]),
      origins={relu_11}
    )), InputBuffer(name='primals_55', layout=FixedLayout('cuda', torch.float32, size=[116, 1, 3, 3], stride=[9, 9, 3, 1]))],
    constant_args=(None, (2, 2), (1, 1), (1, 1), False, (0, 0), 116),
    kwargs={},
    output_view=None,
    origins={convolution_18}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf50', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf49, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_225, i1) * reciprocal(sqrt(load(primals_226, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_56, i1) + load(primals_57, i1),
    ranges=torch.Size([2, 116, 14, 14]),
    origins={add_37}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf51,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ComputedBuffer(name='buf50', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      load(buf49, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_225, i1) * reciprocal(sqrt(load(primals_226, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_56, i1) + load(primals_57, i1),
      ranges=torch.Size([2, 116, 14, 14]),
      origins={add_37}
    )), InputBuffer(name='primals_58', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_19}
  )
)), TensorBox(
  SliceView(
    View(
      StorageBox(
        ComputedBuffer(name='buf54', layout=FixedLayout('cuda', torch.float32, size=[2, 116, 2, 14, 14], stride=[45472, 392, 196, 14, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf53, i4 + 14 * i3 + 196 * i1 + 22736 * i2 + 45472 * i0),
          ranges=[2, 116, 2, 14, 14],
          origins={clone_4}
        ))
      ),
      size=(2, 232, 14, 14),
      reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 116), ModularIndexing(i1, 1, 2), i2, i3],
      origins={view_9}
    ),
    size=[2, 116, 14, 14],
    reindex=lambda i0, i1, i2, i3: [i0, i1 + 116, i2, i3],
    origins={split_3}
  )
), TensorBox(StorageBox(
  Convolution(
    name=buf55,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ReinterpretView(
      View(
        StorageBox(
          ComputedBuffer(name='buf54', layout=FixedLayout('cuda', torch.float32, size=[2, 116, 2, 14, 14], stride=[45472, 392, 196, 14, 1]), data=Pointwise(
            'cuda',
            torch.float32,
            load(buf53, i4 + 14 * i3 + 196 * i1 + 22736 * i2 + 45472 * i0),
            ranges=[2, 116, 2, 14, 14],
            origins={clone_4}
          ))
        ),
        size=(2, 232, 14, 14),
        reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 116), ModularIndexing(i1, 1, 2), i2, i3],
        origins={view_9}
      ),
      FixedLayout('cuda', torch.float32, size=[2, 116, 14, 14], stride=[45472, 196, 14, 1], offset=22736),
      origins={convolution_20}
    ), InputBuffer(name='primals_61', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_20}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf56', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf55, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_231, i1) * reciprocal(sqrt(load(primals_232, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_62, i1) + load(primals_63, i1)),
    ranges=torch.Size([2, 116, 14, 14]),
    origins={relu_13}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf57,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ComputedBuffer(name='buf56', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      relu(load(buf55, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_231, i1) * reciprocal(sqrt(load(primals_232, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_62, i1) + load(primals_63, i1)),
      ranges=torch.Size([2, 116, 14, 14]),
      origins={relu_13}
    )), InputBuffer(name='primals_64', layout=FixedLayout('cuda', torch.float32, size=[116, 1, 3, 3], stride=[9, 9, 3, 1]))],
    constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 116),
    kwargs={},
    output_view=None,
    origins={convolution_21}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf58', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf57, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_234, i1) * reciprocal(sqrt(load(primals_235, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_65, i1) + load(primals_66, i1),
    ranges=torch.Size([2, 116, 14, 14]),
    origins={add_43}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf59,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ComputedBuffer(name='buf58', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      load(buf57, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_234, i1) * reciprocal(sqrt(load(primals_235, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_65, i1) + load(primals_66, i1),
      ranges=torch.Size([2, 116, 14, 14]),
      origins={add_43}
    )), InputBuffer(name='primals_67', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_22}
  )
)), TensorBox(
  SliceView(
    View(
      StorageBox(
        ComputedBuffer(name='buf63', layout=FixedLayout('cuda', torch.float32, size=[2, 116, 2, 14, 14], stride=[45472, 392, 196, 14, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf62, i4 + 14 * i3 + 196 * i1 + 22736 * i2 + 45472 * i0),
          ranges=[2, 116, 2, 14, 14],
          origins={clone_5}
        ))
      ),
      size=(2, 232, 14, 14),
      reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 116), ModularIndexing(i1, 1, 2), i2, i3],
      origins={view_11}
    ),
    size=[2, 116, 14, 14],
    reindex=lambda i0, i1, i2, i3: [i0, i1 + 116, i2, i3],
    origins={split_4}
  )
), TensorBox(StorageBox(
  Convolution(
    name=buf64,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ReinterpretView(
      View(
        StorageBox(
          ComputedBuffer(name='buf63', layout=FixedLayout('cuda', torch.float32, size=[2, 116, 2, 14, 14], stride=[45472, 392, 196, 14, 1]), data=Pointwise(
            'cuda',
            torch.float32,
            load(buf62, i4 + 14 * i3 + 196 * i1 + 22736 * i2 + 45472 * i0),
            ranges=[2, 116, 2, 14, 14],
            origins={clone_5}
          ))
        ),
        size=(2, 232, 14, 14),
        reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 116), ModularIndexing(i1, 1, 2), i2, i3],
        origins={view_11}
      ),
      FixedLayout('cuda', torch.float32, size=[2, 116, 14, 14], stride=[45472, 196, 14, 1], offset=22736),
      origins={convolution_23}
    ), InputBuffer(name='primals_70', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_23}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf65', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf64, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_240, i1) * reciprocal(sqrt(load(primals_241, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_71, i1) + load(primals_72, i1)),
    ranges=torch.Size([2, 116, 14, 14]),
    origins={relu_15}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf66,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ComputedBuffer(name='buf65', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      relu(load(buf64, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_240, i1) * reciprocal(sqrt(load(primals_241, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_71, i1) + load(primals_72, i1)),
      ranges=torch.Size([2, 116, 14, 14]),
      origins={relu_15}
    )), InputBuffer(name='primals_73', layout=FixedLayout('cuda', torch.float32, size=[116, 1, 3, 3], stride=[9, 9, 3, 1]))],
    constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 116),
    kwargs={},
    output_view=None,
    origins={convolution_24}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf67', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf66, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_243, i1) * reciprocal(sqrt(load(primals_244, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_74, i1) + load(primals_75, i1),
    ranges=torch.Size([2, 116, 14, 14]),
    origins={add_49}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf68,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ComputedBuffer(name='buf67', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      load(buf66, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_243, i1) * reciprocal(sqrt(load(primals_244, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_74, i1) + load(primals_75, i1),
      ranges=torch.Size([2, 116, 14, 14]),
      origins={add_49}
    )), InputBuffer(name='primals_76', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_25}
  )
)), TensorBox(
  SliceView(
    View(
      StorageBox(
        ComputedBuffer(name='buf72', layout=FixedLayout('cuda', torch.float32, size=[2, 116, 2, 14, 14], stride=[45472, 392, 196, 14, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf71, i4 + 14 * i3 + 196 * i1 + 22736 * i2 + 45472 * i0),
          ranges=[2, 116, 2, 14, 14],
          origins={clone_6}
        ))
      ),
      size=(2, 232, 14, 14),
      reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 116), ModularIndexing(i1, 1, 2), i2, i3],
      origins={view_13}
    ),
    size=[2, 116, 14, 14],
    reindex=lambda i0, i1, i2, i3: [i0, i1 + 116, i2, i3],
    origins={split_5}
  )
), TensorBox(StorageBox(
  Convolution(
    name=buf73,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ReinterpretView(
      View(
        StorageBox(
          ComputedBuffer(name='buf72', layout=FixedLayout('cuda', torch.float32, size=[2, 116, 2, 14, 14], stride=[45472, 392, 196, 14, 1]), data=Pointwise(
            'cuda',
            torch.float32,
            load(buf71, i4 + 14 * i3 + 196 * i1 + 22736 * i2 + 45472 * i0),
            ranges=[2, 116, 2, 14, 14],
            origins={clone_6}
          ))
        ),
        size=(2, 232, 14, 14),
        reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 116), ModularIndexing(i1, 1, 2), i2, i3],
        origins={view_13}
      ),
      FixedLayout('cuda', torch.float32, size=[2, 116, 14, 14], stride=[45472, 196, 14, 1], offset=22736),
      origins={convolution_26}
    ), InputBuffer(name='primals_79', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_26}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf74', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf73, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_249, i1) * reciprocal(sqrt(load(primals_250, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_80, i1) + load(primals_81, i1)),
    ranges=torch.Size([2, 116, 14, 14]),
    origins={relu_17}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf75,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ComputedBuffer(name='buf74', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      relu(load(buf73, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_249, i1) * reciprocal(sqrt(load(primals_250, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_80, i1) + load(primals_81, i1)),
      ranges=torch.Size([2, 116, 14, 14]),
      origins={relu_17}
    )), InputBuffer(name='primals_82', layout=FixedLayout('cuda', torch.float32, size=[116, 1, 3, 3], stride=[9, 9, 3, 1]))],
    constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 116),
    kwargs={},
    output_view=None,
    origins={convolution_27}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf76', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf75, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_252, i1) * reciprocal(sqrt(load(primals_253, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_83, i1) + load(primals_84, i1),
    ranges=torch.Size([2, 116, 14, 14]),
    origins={add_55}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf77,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ComputedBuffer(name='buf76', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      load(buf75, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_252, i1) * reciprocal(sqrt(load(primals_253, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_83, i1) + load(primals_84, i1),
      ranges=torch.Size([2, 116, 14, 14]),
      origins={add_55}
    )), InputBuffer(name='primals_85', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_28}
  )
)), TensorBox(
  SliceView(
    View(
      StorageBox(
        ComputedBuffer(name='buf81', layout=FixedLayout('cuda', torch.float32, size=[2, 116, 2, 14, 14], stride=[45472, 392, 196, 14, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf80, i4 + 14 * i3 + 196 * i1 + 22736 * i2 + 45472 * i0),
          ranges=[2, 116, 2, 14, 14],
          origins={clone_7}
        ))
      ),
      size=(2, 232, 14, 14),
      reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 116), ModularIndexing(i1, 1, 2), i2, i3],
      origins={view_15}
    ),
    size=[2, 116, 14, 14],
    reindex=lambda i0, i1, i2, i3: [i0, i1 + 116, i2, i3],
    origins={split_6}
  )
), TensorBox(StorageBox(
  Convolution(
    name=buf82,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ReinterpretView(
      View(
        StorageBox(
          ComputedBuffer(name='buf81', layout=FixedLayout('cuda', torch.float32, size=[2, 116, 2, 14, 14], stride=[45472, 392, 196, 14, 1]), data=Pointwise(
            'cuda',
            torch.float32,
            load(buf80, i4 + 14 * i3 + 196 * i1 + 22736 * i2 + 45472 * i0),
            ranges=[2, 116, 2, 14, 14],
            origins={clone_7}
          ))
        ),
        size=(2, 232, 14, 14),
        reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 116), ModularIndexing(i1, 1, 2), i2, i3],
        origins={view_15}
      ),
      FixedLayout('cuda', torch.float32, size=[2, 116, 14, 14], stride=[45472, 196, 14, 1], offset=22736),
      origins={convolution_29}
    ), InputBuffer(name='primals_88', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_29}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf83', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf82, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_258, i1) * reciprocal(sqrt(load(primals_259, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_89, i1) + load(primals_90, i1)),
    ranges=torch.Size([2, 116, 14, 14]),
    origins={relu_19}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf84,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ComputedBuffer(name='buf83', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      relu(load(buf82, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_258, i1) * reciprocal(sqrt(load(primals_259, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_89, i1) + load(primals_90, i1)),
      ranges=torch.Size([2, 116, 14, 14]),
      origins={relu_19}
    )), InputBuffer(name='primals_91', layout=FixedLayout('cuda', torch.float32, size=[116, 1, 3, 3], stride=[9, 9, 3, 1]))],
    constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 116),
    kwargs={},
    output_view=None,
    origins={convolution_30}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf85', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf84, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_261, i1) * reciprocal(sqrt(load(primals_262, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_92, i1) + load(primals_93, i1),
    ranges=torch.Size([2, 116, 14, 14]),
    origins={add_61}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf86,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ComputedBuffer(name='buf85', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      load(buf84, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_261, i1) * reciprocal(sqrt(load(primals_262, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_92, i1) + load(primals_93, i1),
      ranges=torch.Size([2, 116, 14, 14]),
      origins={add_61}
    )), InputBuffer(name='primals_94', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_31}
  )
)), TensorBox(
  SliceView(
    View(
      StorageBox(
        ComputedBuffer(name='buf90', layout=FixedLayout('cuda', torch.float32, size=[2, 116, 2, 14, 14], stride=[45472, 392, 196, 14, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf89, i4 + 14 * i3 + 196 * i1 + 22736 * i2 + 45472 * i0),
          ranges=[2, 116, 2, 14, 14],
          origins={clone_8}
        ))
      ),
      size=(2, 232, 14, 14),
      reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 116), ModularIndexing(i1, 1, 2), i2, i3],
      origins={view_17}
    ),
    size=[2, 116, 14, 14],
    reindex=lambda i0, i1, i2, i3: [i0, i1 + 116, i2, i3],
    origins={split_7}
  )
), TensorBox(StorageBox(
  Convolution(
    name=buf91,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ReinterpretView(
      View(
        StorageBox(
          ComputedBuffer(name='buf90', layout=FixedLayout('cuda', torch.float32, size=[2, 116, 2, 14, 14], stride=[45472, 392, 196, 14, 1]), data=Pointwise(
            'cuda',
            torch.float32,
            load(buf89, i4 + 14 * i3 + 196 * i1 + 22736 * i2 + 45472 * i0),
            ranges=[2, 116, 2, 14, 14],
            origins={clone_8}
          ))
        ),
        size=(2, 232, 14, 14),
        reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 116), ModularIndexing(i1, 1, 2), i2, i3],
        origins={view_17}
      ),
      FixedLayout('cuda', torch.float32, size=[2, 116, 14, 14], stride=[45472, 196, 14, 1], offset=22736),
      origins={convolution_32}
    ), InputBuffer(name='primals_97', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_32}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf92', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf91, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_267, i1) * reciprocal(sqrt(load(primals_268, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_98, i1) + load(primals_99, i1)),
    ranges=torch.Size([2, 116, 14, 14]),
    origins={relu_21}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf93,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ComputedBuffer(name='buf92', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      relu(load(buf91, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_267, i1) * reciprocal(sqrt(load(primals_268, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_98, i1) + load(primals_99, i1)),
      ranges=torch.Size([2, 116, 14, 14]),
      origins={relu_21}
    )), InputBuffer(name='primals_100', layout=FixedLayout('cuda', torch.float32, size=[116, 1, 3, 3], stride=[9, 9, 3, 1]))],
    constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 116),
    kwargs={},
    output_view=None,
    origins={convolution_33}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf94', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf93, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_270, i1) * reciprocal(sqrt(load(primals_271, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_101, i1) + load(primals_102, i1),
    ranges=torch.Size([2, 116, 14, 14]),
    origins={add_67}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf95,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ComputedBuffer(name='buf94', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      load(buf93, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_270, i1) * reciprocal(sqrt(load(primals_271, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_101, i1) + load(primals_102, i1),
      ranges=torch.Size([2, 116, 14, 14]),
      origins={add_67}
    )), InputBuffer(name='primals_103', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_34}
  )
)), TensorBox(
  SliceView(
    View(
      StorageBox(
        ComputedBuffer(name='buf99', layout=FixedLayout('cuda', torch.float32, size=[2, 116, 2, 14, 14], stride=[45472, 392, 196, 14, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf98, i4 + 14 * i3 + 196 * i1 + 22736 * i2 + 45472 * i0),
          ranges=[2, 116, 2, 14, 14],
          origins={clone_9}
        ))
      ),
      size=(2, 232, 14, 14),
      reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 116), ModularIndexing(i1, 1, 2), i2, i3],
      origins={view_19}
    ),
    size=[2, 116, 14, 14],
    reindex=lambda i0, i1, i2, i3: [i0, i1 + 116, i2, i3],
    origins={split_8}
  )
), TensorBox(StorageBox(
  Convolution(
    name=buf100,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ReinterpretView(
      View(
        StorageBox(
          ComputedBuffer(name='buf99', layout=FixedLayout('cuda', torch.float32, size=[2, 116, 2, 14, 14], stride=[45472, 392, 196, 14, 1]), data=Pointwise(
            'cuda',
            torch.float32,
            load(buf98, i4 + 14 * i3 + 196 * i1 + 22736 * i2 + 45472 * i0),
            ranges=[2, 116, 2, 14, 14],
            origins={clone_9}
          ))
        ),
        size=(2, 232, 14, 14),
        reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 116), ModularIndexing(i1, 1, 2), i2, i3],
        origins={view_19}
      ),
      FixedLayout('cuda', torch.float32, size=[2, 116, 14, 14], stride=[45472, 196, 14, 1], offset=22736),
      origins={convolution_35}
    ), InputBuffer(name='primals_106', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_35}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf101', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf100, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_276, i1) * reciprocal(sqrt(load(primals_277, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_107, i1) + load(primals_108, i1)),
    ranges=torch.Size([2, 116, 14, 14]),
    origins={relu_23}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf102,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ComputedBuffer(name='buf101', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      relu(load(buf100, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_276, i1) * reciprocal(sqrt(load(primals_277, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_107, i1) + load(primals_108, i1)),
      ranges=torch.Size([2, 116, 14, 14]),
      origins={relu_23}
    )), InputBuffer(name='primals_109', layout=FixedLayout('cuda', torch.float32, size=[116, 1, 3, 3], stride=[9, 9, 3, 1]))],
    constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 116),
    kwargs={},
    output_view=None,
    origins={convolution_36}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf103', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf102, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_279, i1) * reciprocal(sqrt(load(primals_280, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_110, i1) + load(primals_111, i1),
    ranges=torch.Size([2, 116, 14, 14]),
    origins={add_73}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf104,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ComputedBuffer(name='buf103', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      load(buf102, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_279, i1) * reciprocal(sqrt(load(primals_280, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_110, i1) + load(primals_111, i1),
      ranges=torch.Size([2, 116, 14, 14]),
      origins={add_73}
    )), InputBuffer(name='primals_112', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_37}
  )
)), TensorBox(
  SliceView(
    View(
      StorageBox(
        ComputedBuffer(name='buf108', layout=FixedLayout('cuda', torch.float32, size=[2, 116, 2, 14, 14], stride=[45472, 392, 196, 14, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf107, i4 + 14 * i3 + 196 * i1 + 22736 * i2 + 45472 * i0),
          ranges=[2, 116, 2, 14, 14],
          origins={clone_10}
        ))
      ),
      size=(2, 232, 14, 14),
      reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 116), ModularIndexing(i1, 1, 2), i2, i3],
      origins={view_21}
    ),
    size=[2, 116, 14, 14],
    reindex=lambda i0, i1, i2, i3: [i0, i1 + 116, i2, i3],
    origins={split_9}
  )
), TensorBox(StorageBox(
  Convolution(
    name=buf109,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ReinterpretView(
      View(
        StorageBox(
          ComputedBuffer(name='buf108', layout=FixedLayout('cuda', torch.float32, size=[2, 116, 2, 14, 14], stride=[45472, 392, 196, 14, 1]), data=Pointwise(
            'cuda',
            torch.float32,
            load(buf107, i4 + 14 * i3 + 196 * i1 + 22736 * i2 + 45472 * i0),
            ranges=[2, 116, 2, 14, 14],
            origins={clone_10}
          ))
        ),
        size=(2, 232, 14, 14),
        reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 116), ModularIndexing(i1, 1, 2), i2, i3],
        origins={view_21}
      ),
      FixedLayout('cuda', torch.float32, size=[2, 116, 14, 14], stride=[45472, 196, 14, 1], offset=22736),
      origins={convolution_38}
    ), InputBuffer(name='primals_115', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_38}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf110', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf109, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_285, i1) * reciprocal(sqrt(load(primals_286, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_116, i1) + load(primals_117, i1)),
    ranges=torch.Size([2, 116, 14, 14]),
    origins={relu_25}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf111,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ComputedBuffer(name='buf110', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      relu(load(buf109, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_285, i1) * reciprocal(sqrt(load(primals_286, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_116, i1) + load(primals_117, i1)),
      ranges=torch.Size([2, 116, 14, 14]),
      origins={relu_25}
    )), InputBuffer(name='primals_118', layout=FixedLayout('cuda', torch.float32, size=[116, 1, 3, 3], stride=[9, 9, 3, 1]))],
    constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 116),
    kwargs={},
    output_view=None,
    origins={convolution_39}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf112', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf111, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_288, i1) * reciprocal(sqrt(load(primals_289, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_119, i1) + load(primals_120, i1),
    ranges=torch.Size([2, 116, 14, 14]),
    origins={add_79}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf113,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]),
    inputs=[ComputedBuffer(name='buf112', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 116, 14, 14]), stride=[22736, 196, 14, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      load(buf111, i3 + 14 * i2 + 196 * i1 + 22736 * i0) - load(primals_288, i1) * reciprocal(sqrt(load(primals_289, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_119, i1) + load(primals_120, i1),
      ranges=torch.Size([2, 116, 14, 14]),
      origins={add_79}
    )), InputBuffer(name='primals_121', layout=FixedLayout('cuda', torch.float32, size=[116, 116, 1, 1], stride=[116, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_40}
  )
)), TensorBox(
  View(
    StorageBox(
      ComputedBuffer(name='buf117', layout=FixedLayout('cuda', torch.float32, size=[2, 116, 2, 14, 14], stride=[45472, 392, 196, 14, 1]), data=Pointwise(
        'cuda',
        torch.float32,
        load(buf116, i4 + 14 * i3 + 196 * i1 + 22736 * i2 + 45472 * i0),
        ranges=[2, 116, 2, 14, 14],
        origins={clone_11}
      ))
    ),
    size=(2, 232, 14, 14),
    reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 116), ModularIndexing(i1, 1, 2), i2, i3],
    origins={view_23}
  )
), TensorBox(StorageBox(
  Convolution(
    name=buf118,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]),
    inputs=[ReinterpretView(
      StorageBox(
        ComputedBuffer(name='buf117', layout=FixedLayout('cuda', torch.float32, size=[2, 116, 2, 14, 14], stride=[45472, 392, 196, 14, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf116, i4 + 14 * i3 + 196 * i1 + 22736 * i2 + 45472 * i0),
          ranges=[2, 116, 2, 14, 14],
          origins={clone_11}
        ))
      ),
      FixedLayout('cuda', torch.float32, size=(2, 232, 14, 14), stride=[45472, 196, 14, 1]),
      origins={convolution_41}
    ), InputBuffer(name='primals_124', layout=FixedLayout('cuda', torch.float32, size=[232, 1, 3, 3], stride=[9, 9, 3, 1]))],
    constant_args=(None, (2, 2), (1, 1), (1, 1), False, (0, 0), 232),
    kwargs={},
    output_view=None,
    origins={convolution_41}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf119', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf118, i3 + 7 * i2 + 49 * i1 + 11368 * i0) - load(primals_294, i1) * reciprocal(sqrt(load(primals_295, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_125, i1) + load(primals_126, i1),
    ranges=torch.Size([2, 232, 7, 7]),
    origins={add_83}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf120,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]),
    inputs=[ComputedBuffer(name='buf119', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      load(buf118, i3 + 7 * i2 + 49 * i1 + 11368 * i0) - load(primals_294, i1) * reciprocal(sqrt(load(primals_295, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_125, i1) + load(primals_126, i1),
      ranges=torch.Size([2, 232, 7, 7]),
      origins={add_83}
    )), InputBuffer(name='primals_127', layout=FixedLayout('cuda', torch.float32, size=[232, 232, 1, 1], stride=[232, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_42}
  )
)), TensorBox(StorageBox(
  Convolution(
    name=buf122,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 232, 14, 14]), stride=[45472, 196, 14, 1]),
    inputs=[ReinterpretView(
      StorageBox(
        ComputedBuffer(name='buf117', layout=FixedLayout('cuda', torch.float32, size=[2, 116, 2, 14, 14], stride=[45472, 392, 196, 14, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf116, i4 + 14 * i3 + 196 * i1 + 22736 * i2 + 45472 * i0),
          ranges=[2, 116, 2, 14, 14],
          origins={clone_11}
        ))
      ),
      FixedLayout('cuda', torch.float32, size=(2, 232, 14, 14), stride=[45472, 196, 14, 1]),
      origins={convolution_43}
    ), InputBuffer(name='primals_130', layout=FixedLayout('cuda', torch.float32, size=[232, 232, 1, 1], stride=[232, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_43}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf123', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 232, 14, 14]), stride=[45472, 196, 14, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf122, i3 + 14 * i2 + 196 * i1 + 45472 * i0) - load(primals_300, i1) * reciprocal(sqrt(load(primals_301, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_131, i1) + load(primals_132, i1)),
    ranges=torch.Size([2, 232, 14, 14]),
    origins={relu_28}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf124,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]),
    inputs=[ComputedBuffer(name='buf123', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 232, 14, 14]), stride=[45472, 196, 14, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      relu(load(buf122, i3 + 14 * i2 + 196 * i1 + 45472 * i0) - load(primals_300, i1) * reciprocal(sqrt(load(primals_301, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_131, i1) + load(primals_132, i1)),
      ranges=torch.Size([2, 232, 14, 14]),
      origins={relu_28}
    )), InputBuffer(name='primals_133', layout=FixedLayout('cuda', torch.float32, size=[232, 1, 3, 3], stride=[9, 9, 3, 1]))],
    constant_args=(None, (2, 2), (1, 1), (1, 1), False, (0, 0), 232),
    kwargs={},
    output_view=None,
    origins={convolution_44}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf125', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf124, i3 + 7 * i2 + 49 * i1 + 11368 * i0) - load(primals_303, i1) * reciprocal(sqrt(load(primals_304, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_134, i1) + load(primals_135, i1),
    ranges=torch.Size([2, 232, 7, 7]),
    origins={add_89}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf126,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]),
    inputs=[ComputedBuffer(name='buf125', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      load(buf124, i3 + 7 * i2 + 49 * i1 + 11368 * i0) - load(primals_303, i1) * reciprocal(sqrt(load(primals_304, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_134, i1) + load(primals_135, i1),
      ranges=torch.Size([2, 232, 7, 7]),
      origins={add_89}
    )), InputBuffer(name='primals_136', layout=FixedLayout('cuda', torch.float32, size=[232, 232, 1, 1], stride=[232, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_45}
  )
)), TensorBox(
  SliceView(
    View(
      StorageBox(
        ComputedBuffer(name='buf129', layout=FixedLayout('cuda', torch.float32, size=[2, 232, 2, 7, 7], stride=[22736, 98, 49, 7, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf128, i4 + 7 * i3 + 49 * i1 + 11368 * i2 + 22736 * i0),
          ranges=[2, 232, 2, 7, 7],
          origins={clone_12}
        ))
      ),
      size=(2, 464, 7, 7),
      reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 232), ModularIndexing(i1, 1, 2), i2, i3],
      origins={view_25}
    ),
    size=[2, 232, 7, 7],
    reindex=lambda i0, i1, i2, i3: [i0, i1 + 232, i2, i3],
    origins={split_10}
  )
), TensorBox(StorageBox(
  Convolution(
    name=buf130,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]),
    inputs=[ReinterpretView(
      View(
        StorageBox(
          ComputedBuffer(name='buf129', layout=FixedLayout('cuda', torch.float32, size=[2, 232, 2, 7, 7], stride=[22736, 98, 49, 7, 1]), data=Pointwise(
            'cuda',
            torch.float32,
            load(buf128, i4 + 7 * i3 + 49 * i1 + 11368 * i2 + 22736 * i0),
            ranges=[2, 232, 2, 7, 7],
            origins={clone_12}
          ))
        ),
        size=(2, 464, 7, 7),
        reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 232), ModularIndexing(i1, 1, 2), i2, i3],
        origins={view_25}
      ),
      FixedLayout('cuda', torch.float32, size=[2, 232, 7, 7], stride=[22736, 49, 7, 1], offset=11368),
      origins={convolution_46}
    ), InputBuffer(name='primals_139', layout=FixedLayout('cuda', torch.float32, size=[232, 232, 1, 1], stride=[232, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_46}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf131', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf130, i3 + 7 * i2 + 49 * i1 + 11368 * i0) - load(primals_309, i1) * reciprocal(sqrt(load(primals_310, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_140, i1) + load(primals_141, i1)),
    ranges=torch.Size([2, 232, 7, 7]),
    origins={relu_30}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf132,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]),
    inputs=[ComputedBuffer(name='buf131', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      relu(load(buf130, i3 + 7 * i2 + 49 * i1 + 11368 * i0) - load(primals_309, i1) * reciprocal(sqrt(load(primals_310, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_140, i1) + load(primals_141, i1)),
      ranges=torch.Size([2, 232, 7, 7]),
      origins={relu_30}
    )), InputBuffer(name='primals_142', layout=FixedLayout('cuda', torch.float32, size=[232, 1, 3, 3], stride=[9, 9, 3, 1]))],
    constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 232),
    kwargs={},
    output_view=None,
    origins={convolution_47}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf133', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf132, i3 + 7 * i2 + 49 * i1 + 11368 * i0) - load(primals_312, i1) * reciprocal(sqrt(load(primals_313, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_143, i1) + load(primals_144, i1),
    ranges=torch.Size([2, 232, 7, 7]),
    origins={add_95}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf134,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]),
    inputs=[ComputedBuffer(name='buf133', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      load(buf132, i3 + 7 * i2 + 49 * i1 + 11368 * i0) - load(primals_312, i1) * reciprocal(sqrt(load(primals_313, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_143, i1) + load(primals_144, i1),
      ranges=torch.Size([2, 232, 7, 7]),
      origins={add_95}
    )), InputBuffer(name='primals_145', layout=FixedLayout('cuda', torch.float32, size=[232, 232, 1, 1], stride=[232, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_48}
  )
)), TensorBox(
  SliceView(
    View(
      StorageBox(
        ComputedBuffer(name='buf138', layout=FixedLayout('cuda', torch.float32, size=[2, 232, 2, 7, 7], stride=[22736, 98, 49, 7, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf137, i4 + 7 * i3 + 49 * i1 + 11368 * i2 + 22736 * i0),
          ranges=[2, 232, 2, 7, 7],
          origins={clone_13}
        ))
      ),
      size=(2, 464, 7, 7),
      reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 232), ModularIndexing(i1, 1, 2), i2, i3],
      origins={view_27}
    ),
    size=[2, 232, 7, 7],
    reindex=lambda i0, i1, i2, i3: [i0, i1 + 232, i2, i3],
    origins={split_11}
  )
), TensorBox(StorageBox(
  Convolution(
    name=buf139,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]),
    inputs=[ReinterpretView(
      View(
        StorageBox(
          ComputedBuffer(name='buf138', layout=FixedLayout('cuda', torch.float32, size=[2, 232, 2, 7, 7], stride=[22736, 98, 49, 7, 1]), data=Pointwise(
            'cuda',
            torch.float32,
            load(buf137, i4 + 7 * i3 + 49 * i1 + 11368 * i2 + 22736 * i0),
            ranges=[2, 232, 2, 7, 7],
            origins={clone_13}
          ))
        ),
        size=(2, 464, 7, 7),
        reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 232), ModularIndexing(i1, 1, 2), i2, i3],
        origins={view_27}
      ),
      FixedLayout('cuda', torch.float32, size=[2, 232, 7, 7], stride=[22736, 49, 7, 1], offset=11368),
      origins={convolution_49}
    ), InputBuffer(name='primals_148', layout=FixedLayout('cuda', torch.float32, size=[232, 232, 1, 1], stride=[232, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_49}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf140', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf139, i3 + 7 * i2 + 49 * i1 + 11368 * i0) - load(primals_318, i1) * reciprocal(sqrt(load(primals_319, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_149, i1) + load(primals_150, i1)),
    ranges=torch.Size([2, 232, 7, 7]),
    origins={relu_32}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf141,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]),
    inputs=[ComputedBuffer(name='buf140', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      relu(load(buf139, i3 + 7 * i2 + 49 * i1 + 11368 * i0) - load(primals_318, i1) * reciprocal(sqrt(load(primals_319, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_149, i1) + load(primals_150, i1)),
      ranges=torch.Size([2, 232, 7, 7]),
      origins={relu_32}
    )), InputBuffer(name='primals_151', layout=FixedLayout('cuda', torch.float32, size=[232, 1, 3, 3], stride=[9, 9, 3, 1]))],
    constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 232),
    kwargs={},
    output_view=None,
    origins={convolution_50}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf142', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf141, i3 + 7 * i2 + 49 * i1 + 11368 * i0) - load(primals_321, i1) * reciprocal(sqrt(load(primals_322, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_152, i1) + load(primals_153, i1),
    ranges=torch.Size([2, 232, 7, 7]),
    origins={add_101}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf143,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]),
    inputs=[ComputedBuffer(name='buf142', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      load(buf141, i3 + 7 * i2 + 49 * i1 + 11368 * i0) - load(primals_321, i1) * reciprocal(sqrt(load(primals_322, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_152, i1) + load(primals_153, i1),
      ranges=torch.Size([2, 232, 7, 7]),
      origins={add_101}
    )), InputBuffer(name='primals_154', layout=FixedLayout('cuda', torch.float32, size=[232, 232, 1, 1], stride=[232, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_51}
  )
)), TensorBox(
  SliceView(
    View(
      StorageBox(
        ComputedBuffer(name='buf147', layout=FixedLayout('cuda', torch.float32, size=[2, 232, 2, 7, 7], stride=[22736, 98, 49, 7, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf146, i4 + 7 * i3 + 49 * i1 + 11368 * i2 + 22736 * i0),
          ranges=[2, 232, 2, 7, 7],
          origins={clone_14}
        ))
      ),
      size=(2, 464, 7, 7),
      reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 232), ModularIndexing(i1, 1, 2), i2, i3],
      origins={view_29}
    ),
    size=[2, 232, 7, 7],
    reindex=lambda i0, i1, i2, i3: [i0, i1 + 232, i2, i3],
    origins={split_12}
  )
), TensorBox(StorageBox(
  Convolution(
    name=buf148,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]),
    inputs=[ReinterpretView(
      View(
        StorageBox(
          ComputedBuffer(name='buf147', layout=FixedLayout('cuda', torch.float32, size=[2, 232, 2, 7, 7], stride=[22736, 98, 49, 7, 1]), data=Pointwise(
            'cuda',
            torch.float32,
            load(buf146, i4 + 7 * i3 + 49 * i1 + 11368 * i2 + 22736 * i0),
            ranges=[2, 232, 2, 7, 7],
            origins={clone_14}
          ))
        ),
        size=(2, 464, 7, 7),
        reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 232), ModularIndexing(i1, 1, 2), i2, i3],
        origins={view_29}
      ),
      FixedLayout('cuda', torch.float32, size=[2, 232, 7, 7], stride=[22736, 49, 7, 1], offset=11368),
      origins={convolution_52}
    ), InputBuffer(name='primals_157', layout=FixedLayout('cuda', torch.float32, size=[232, 232, 1, 1], stride=[232, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_52}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf149', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf148, i3 + 7 * i2 + 49 * i1 + 11368 * i0) - load(primals_327, i1) * reciprocal(sqrt(load(primals_328, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_158, i1) + load(primals_159, i1)),
    ranges=torch.Size([2, 232, 7, 7]),
    origins={relu_34}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf150,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]),
    inputs=[ComputedBuffer(name='buf149', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      relu(load(buf148, i3 + 7 * i2 + 49 * i1 + 11368 * i0) - load(primals_327, i1) * reciprocal(sqrt(load(primals_328, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_158, i1) + load(primals_159, i1)),
      ranges=torch.Size([2, 232, 7, 7]),
      origins={relu_34}
    )), InputBuffer(name='primals_160', layout=FixedLayout('cuda', torch.float32, size=[232, 1, 3, 3], stride=[9, 9, 3, 1]))],
    constant_args=(None, (1, 1), (1, 1), (1, 1), False, (0, 0), 232),
    kwargs={},
    output_view=None,
    origins={convolution_53}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf151', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf150, i3 + 7 * i2 + 49 * i1 + 11368 * i0) - load(primals_330, i1) * reciprocal(sqrt(load(primals_331, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_161, i1) + load(primals_162, i1),
    ranges=torch.Size([2, 232, 7, 7]),
    origins={add_107}
  ))
)), TensorBox(StorageBox(
  Convolution(
    name=buf152,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]),
    inputs=[ComputedBuffer(name='buf151', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 232, 7, 7]), stride=[11368, 49, 7, 1]), data=Pointwise(
      'cuda',
      torch.float32,
      load(buf150, i3 + 7 * i2 + 49 * i1 + 11368 * i0) - load(primals_330, i1) * reciprocal(sqrt(load(primals_331, i1) + constant(1e-05, torch.float32))) * constant(1, torch.float32) * load(primals_161, i1) + load(primals_162, i1),
      ranges=torch.Size([2, 232, 7, 7]),
      origins={add_107}
    )), InputBuffer(name='primals_163', layout=FixedLayout('cuda', torch.float32, size=[232, 232, 1, 1], stride=[232, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_54}
  )
)), TensorBox(
  View(
    StorageBox(
      ComputedBuffer(name='buf156', layout=FixedLayout('cuda', torch.float32, size=[2, 232, 2, 7, 7], stride=[22736, 98, 49, 7, 1]), data=Pointwise(
        'cuda',
        torch.float32,
        load(buf155, i4 + 7 * i3 + 49 * i1 + 11368 * i2 + 22736 * i0),
        ranges=[2, 232, 2, 7, 7],
        origins={clone_15}
      ))
    ),
    size=(2, 464, 7, 7),
    reindex=lambda i0, i1, i2, i3: [i0, ModularIndexing(i1, 2, 232), ModularIndexing(i1, 1, 2), i2, i3],
    origins={view_31}
  )
), TensorBox(StorageBox(
  Convolution(
    name=buf157,
    layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 1024, 7, 7]), stride=[50176, 49, 7, 1]),
    inputs=[ReinterpretView(
      StorageBox(
        ComputedBuffer(name='buf156', layout=FixedLayout('cuda', torch.float32, size=[2, 232, 2, 7, 7], stride=[22736, 98, 49, 7, 1]), data=Pointwise(
          'cuda',
          torch.float32,
          load(buf155, i4 + 7 * i3 + 49 * i1 + 11368 * i2 + 22736 * i0),
          ranges=[2, 232, 2, 7, 7],
          origins={clone_15}
        ))
      ),
      FixedLayout('cuda', torch.float32, size=(2, 464, 7, 7), stride=[22736, 49, 7, 1]),
      origins={convolution_55}
    ), InputBuffer(name='primals_166', layout=FixedLayout('cuda', torch.float32, size=[1024, 464, 1, 1], stride=[464, 1, 1, 1]))],
    constant_args=(None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1),
    kwargs={},
    output_view=None,
    origins={convolution_55}
  )
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf160', layout=FlexibleLayout('cuda', torch.float32, size=[2, 1024], stride=[1024, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    load(buf159, i1 + 1024 * i0) / index_expr(49, torch.float32),
    ranges=[2, 1024],
    origins={mean}
  ))
)), TensorBox(
  ReinterpretView(
    StorageBox(
      InputBuffer(name='primals_169', layout=FixedLayout('cuda', torch.float32, size=[1000, 1024], stride=[1024, 1]))
    ),
    FixedLayout('cuda', torch.float32, size=[1000, 1024], stride=[1024, 1]),
    origins={permute_17}
  )
), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf158, i3 + 7 * i2 + 49 * i1 + 50176 * i0) <= constant(0, torch.float32),
    ranges=torch.Size([2, 1024, 7, 7]),
    origins={le}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf153, i3 + 7 * i2 + 49 * i1 + 22736 * i0) <= constant(0, torch.float32),
    ranges=[2, 232, 7, 7],
    origins={le_1}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf144, i3 + 7 * i2 + 49 * i1 + 22736 * i0) <= constant(0, torch.float32),
    ranges=[2, 232, 7, 7],
    origins={le_3}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf135, i3 + 7 * i2 + 49 * i1 + 22736 * i0) <= constant(0, torch.float32),
    ranges=[2, 232, 7, 7],
    origins={le_5}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf127, i3 + 7 * i2 + 49 * i1 + 22736 * i0) <= constant(0, torch.float32),
    ranges=[2, 232, 7, 7],
    origins={le_7}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf121, i3 + 7 * i2 + 49 * i1 + 22736 * i0) <= constant(0, torch.float32),
    ranges=[2, 232, 7, 7],
    origins={le_9}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf114, i3 + 14 * i2 + 196 * i1 + 45472 * i0) <= constant(0, torch.float32),
    ranges=[2, 116, 14, 14],
    origins={le_10}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf105, i3 + 14 * i2 + 196 * i1 + 45472 * i0) <= constant(0, torch.float32),
    ranges=[2, 116, 14, 14],
    origins={le_12}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf96, i3 + 14 * i2 + 196 * i1 + 45472 * i0) <= constant(0, torch.float32),
    ranges=[2, 116, 14, 14],
    origins={le_14}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf87, i3 + 14 * i2 + 196 * i1 + 45472 * i0) <= constant(0, torch.float32),
    ranges=[2, 116, 14, 14],
    origins={le_16}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf78, i3 + 14 * i2 + 196 * i1 + 45472 * i0) <= constant(0, torch.float32),
    ranges=[2, 116, 14, 14],
    origins={le_18}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf69, i3 + 14 * i2 + 196 * i1 + 45472 * i0) <= constant(0, torch.float32),
    ranges=[2, 116, 14, 14],
    origins={le_20}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf60, i3 + 14 * i2 + 196 * i1 + 45472 * i0) <= constant(0, torch.float32),
    ranges=[2, 116, 14, 14],
    origins={le_22}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf52, i3 + 14 * i2 + 196 * i1 + 45472 * i0) <= constant(0, torch.float32),
    ranges=[2, 116, 14, 14],
    origins={le_24}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf46, i3 + 14 * i2 + 196 * i1 + 45472 * i0) <= constant(0, torch.float32),
    ranges=[2, 116, 14, 14],
    origins={le_26}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf39, i3 + 28 * i2 + 784 * i1 + 90944 * i0) <= constant(0, torch.float32),
    ranges=[2, 58, 28, 28],
    origins={le_27}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf30, i3 + 28 * i2 + 784 * i1 + 90944 * i0) <= constant(0, torch.float32),
    ranges=[2, 58, 28, 28],
    origins={le_29}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf21, i3 + 28 * i2 + 784 * i1 + 90944 * i0) <= constant(0, torch.float32),
    ranges=[2, 58, 28, 28],
    origins={le_31}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf13, i3 + 28 * i2 + 784 * i1 + 90944 * i0) <= constant(0, torch.float32),
    ranges=[2, 58, 28, 28],
    origins={le_33}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf7, i3 + 28 * i2 + 784 * i1 + 90944 * i0) <= constant(0, torch.float32),
    ranges=[2, 58, 28, 28],
    origins={le_35}
  )
)), s0, 28, 28, 14, 14, 7, 7]

While executing return [addmm, primals_1, primals_2, primals_4, primals_5, primals_7, primals_8, primals_10, primals_11, primals_13, primals_14, primals_16, primals_17, primals_19, primals_20, primals_22, primals_23, primals_25, primals_26, primals_28, primals_29, primals_31, primals_32, primals_34, primals_35, primals_37, primals_38, primals_40, primals_41, primals_43, primals_44, primals_46, primals_47, primals_49, primals_50, primals_52, primals_53, primals_55, primals_56, primals_58, primals_59, primals_61, primals_62, primals_64, primals_65, primals_67, primals_68, primals_70, primals_71, primals_73, primals_74, primals_76, primals_77, primals_79, primals_80, primals_82, primals_83, primals_85, primals_86, primals_88, primals_89, primals_91, primals_92, primals_94, primals_95, primals_97, primals_98, primals_100, primals_101, primals_103, primals_104, primals_106, primals_107, primals_109, primals_110, primals_112, primals_113, primals_115, primals_116, primals_118, primals_119, primals_121, primals_122, primals_124, primals_125, primals_127, primals_128, primals_130, primals_131, primals_133, primals_134, primals_136, primals_137, primals_139, primals_140, primals_142, primals_143, primals_145, primals_146, primals_148, primals_149, primals_151, primals_152, primals_154, primals_155, primals_157, primals_158, primals_160, primals_161, primals_163, primals_164, primals_166, primals_167, primals_171, primals_172, primals_174, primals_175, primals_177, primals_178, primals_180, primals_181, primals_183, primals_184, primals_186, primals_187, primals_189, primals_190, primals_192, primals_193, primals_195, primals_196, primals_198, primals_199, primals_201, primals_202, primals_204, primals_205, primals_207, primals_208, primals_210, primals_211, primals_213, primals_214, primals_216, primals_217, primals_219, primals_220, primals_222, primals_223, primals_225, primals_226, primals_228, primals_229, primals_231, primals_232, primals_234, primals_235, primals_237, 
primals_238, primals_240, primals_241, primals_243, primals_244, primals_246, primals_247, primals_249, primals_250, primals_252, primals_253, primals_255, primals_256, primals_258, primals_259, primals_261, primals_262, primals_264, primals_265, primals_267, primals_268, primals_270, primals_271, primals_273, primals_274, primals_276, primals_277, primals_279, primals_280, primals_282, primals_283, primals_285, primals_286, primals_288, primals_289, primals_291, primals_292, primals_294, primals_295, primals_297, primals_298, primals_300, primals_301, primals_303, primals_304, primals_306, primals_307, primals_309, primals_310, primals_312, primals_313, primals_315, primals_316, primals_318, primals_319, primals_321, primals_322, primals_324, primals_325, primals_327, primals_328, primals_330, primals_331, primals_333, primals_334, primals_336, primals_337, primals_339, convolution, relu, getitem, getitem_1, convolution_1, add_3, convolution_2, convolution_3, relu_2, convolution_4, add_9, convolution_5, getitem_3, convolution_6, relu_4, convolution_7, add_15, convolution_8, getitem_5, convolution_9, relu_6, convolution_10, add_21, convolution_11, getitem_7, convolution_12, relu_8, convolution_13, add_27, convolution_14, view_7, convolution_15, add_31, convolution_16, convolution_17, relu_11, convolution_18, add_37, convolution_19, getitem_9, convolution_20, relu_13, convolution_21, add_43, convolution_22, getitem_11, convolution_23, relu_15, convolution_24, add_49, convolution_25, getitem_13, convolution_26, relu_17, convolution_27, add_55, convolution_28, getitem_15, convolution_29, relu_19, convolution_30, add_61, convolution_31, getitem_17, convolution_32, relu_21, convolution_33, add_67, convolution_34, getitem_19, convolution_35, relu_23, convolution_36, add_73, convolution_37, getitem_21, convolution_38, relu_25, convolution_39, add_79, convolution_40, view_23, convolution_41, add_83, convolution_42, convolution_43, relu_28, convolution_44, add_89, 
convolution_45, getitem_23, convolution_46, relu_30, convolution_47, add_95, convolution_48, getitem_25, convolution_49, relu_32, convolution_50, add_101, convolution_51, getitem_27, convolution_52, relu_34, convolution_53, add_107, convolution_54, view_31, convolution_55, mean, permute_17, le, le_1, le_3, le_5, le_7, le_9, le_10, le_12, le_14, le_16, le_18, le_20, le_22, le_24, le_26, le_27, le_29, le_31, le_33, le_35, sym_size, sym_size_1, sym_size_2, sym_size_3, sym_size_4, sym_size_5, sym_size_6]
Original traceback:
None
TorchDynamo optimized model failed to run because of following error
cuda train shufflenet_v2_x1_0                 FAIL
/scratch/ezyang/work/env/lib/python3.9/site-packages/gym/core.py:317: DeprecationWarning: [33mWARN: Initializing wrapper in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future.[0m
  deprecation(
/scratch/ezyang/work/env/lib/python3.9/site-packages/gym/wrappers/step_api_compatibility.py:39: DeprecationWarning: [33mWARN: Initializing environment in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future.[0m
  deprecation(
Running torchbench.py soft_actor_critic...
cuda train soft_actor_critic                  PASS
Running torchbench.py speech_transformer...
ERROR:common:
Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1122, in check_accuracy
    new_result = optimized_model_iter_fn(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1020, in run_n_iterations
    self.model_iter_fn(mod, inputs, collect_outputs=False)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 332, in forward_and_backward_pass
    cloned_inputs = clone_inputs(inputs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 335, in <graph break in forward_and_backward_pass>
    pred = mod(*cloned_inputs)
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/torchbenchmark/torchbenchmark/models/speech_transformer/speech_transformer/transformer/transformer.py", line 28, in forward
    encoder_padded_outputs, *_ = self.encoder(padded_input, input_lengths)
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/torchbenchmark/torchbenchmark/models/speech_transformer/speech_transformer/transformer/encoder.py", line 48, in forward
    non_pad_mask = get_non_pad_mask(padded_input, input_lengths=input_lengths)
  File "/scratch/ezyang/work/torchbenchmark/torchbenchmark/models/speech_transformer/speech_transformer/transformer/encoder.py", line 50, in <graph break in forward>
    slf_attn_mask = get_attn_pad_mask(padded_input, input_lengths, length)
  File "/scratch/ezyang/work/torchbenchmark/torchbenchmark/models/speech_transformer/speech_transformer/transformer/encoder.py", line 55, in <graph break in forward>
    self.positional_encoding(padded_input))
  File "/scratch/ezyang/work/torchbenchmark/torchbenchmark/models/speech_transformer/speech_transformer/transformer/encoder.py", line 55, in <graph break in forward>
    self.positional_encoding(padded_input))
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 954, in forward
    return compiled_f(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 940, in new_func
    compiled_fn = create_aot_dispatcher_function(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 660, in create_aot_dispatcher_function
    aot_dispatch_autograd(flat_fn, fake_flat_tensor_args, aot_config)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 516, in aot_dispatch_autograd
    compiled_fw_func = aot_config.fw_compiler(fw_module, deduped_flat_args)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 87, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 351, in fw_compiler
    return compile_fx_inner(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/debug_utils.py", line 444, in debug_wrapper
    compiled_fn = compiler_fn(gm, example_inputs, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/debug.py", line 177, in inner
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/env/lib/python3.9/contextlib.py", line 79, in inner
    return func(*args, **kwds)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 122, in compile_fx_inner
    compiled_fn = graph.compile_to_fn()
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 349, in compile_to_fn
    return self.compile_to_module().call
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 87, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 335, in compile_to_module
    code = self.codegen()
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 326, in codegen
    self.wrapper_code = WrapperCodeGen()
  File "/scratch/ezyang/work/pytorch/torch/_inductor/codegen/wrapper.py", line 240, in __init__
    V.graph.sizevars.codegen(self.prefix, V.graph.graph_inputs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/sizevars.py", line 481, in codegen
    assert not needed
AssertionError
TorchDynamo optimized model failed to run because of following error
cuda train speech_transformer                 FAIL
Running torchbench.py squeezenet1_1...
ERROR:common:[TensorBox(
  View(
    StorageBox(
      Pointwise(
        'cuda',
        torch.float32,
        load(buf65, i1 + 1000 * i0) / index_expr(169, torch.float32),
        ranges=[2, 1000, 1, 1],
        origins={mean}
      )
    ),
    size=(2, 1000),
    reindex=lambda i0, i1: [i0, i1, 0, 0],
    origins={view}
  )
), TensorBox(StorageBox(
  InputBuffer(name='primals_1', layout=FixedLayout('cuda', torch.float32, size=[64, 3, 3, 3], stride=[27, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_3', layout=FixedLayout('cuda', torch.float32, size=[16, 64, 1, 1], stride=[64, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_5', layout=FixedLayout('cuda', torch.float32, size=[64, 16, 1, 1], stride=[16, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_7', layout=FixedLayout('cuda', torch.float32, size=[64, 16, 3, 3], stride=[144, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_9', layout=FixedLayout('cuda', torch.float32, size=[16, 128, 1, 1], stride=[128, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_11', layout=FixedLayout('cuda', torch.float32, size=[64, 16, 1, 1], stride=[16, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_13', layout=FixedLayout('cuda', torch.float32, size=[64, 16, 3, 3], stride=[144, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_15', layout=FixedLayout('cuda', torch.float32, size=[32, 128, 1, 1], stride=[128, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_17', layout=FixedLayout('cuda', torch.float32, size=[128, 32, 1, 1], stride=[32, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_19', layout=FixedLayout('cuda', torch.float32, size=[128, 32, 3, 3], stride=[288, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_21', layout=FixedLayout('cuda', torch.float32, size=[32, 256, 1, 1], stride=[256, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_23', layout=FixedLayout('cuda', torch.float32, size=[128, 32, 1, 1], stride=[32, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_25', layout=FixedLayout('cuda', torch.float32, size=[128, 32, 3, 3], stride=[288, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_27', layout=FixedLayout('cuda', torch.float32, size=[48, 256, 1, 1], stride=[256, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_29', layout=FixedLayout('cuda', torch.float32, size=[192, 48, 1, 1], stride=[48, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_31', layout=FixedLayout('cuda', torch.float32, size=[192, 48, 3, 3], stride=[432, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_33', layout=FixedLayout('cuda', torch.float32, size=[48, 384, 1, 1], stride=[384, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_35', layout=FixedLayout('cuda', torch.float32, size=[192, 48, 1, 1], stride=[48, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_37', layout=FixedLayout('cuda', torch.float32, size=[192, 48, 3, 3], stride=[432, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_39', layout=FixedLayout('cuda', torch.float32, size=[64, 384, 1, 1], stride=[384, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_41', layout=FixedLayout('cuda', torch.float32, size=[256, 64, 1, 1], stride=[64, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_43', layout=FixedLayout('cuda', torch.float32, size=[256, 64, 3, 3], stride=[576, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_45', layout=FixedLayout('cuda', torch.float32, size=[64, 512, 1, 1], stride=[512, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_47', layout=FixedLayout('cuda', torch.float32, size=[256, 64, 1, 1], stride=[64, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_49', layout=FixedLayout('cuda', torch.float32, size=[256, 64, 3, 3], stride=[576, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_51', layout=FixedLayout('cuda', torch.float32, size=[1000, 512, 1, 1], stride=[512, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_53', layout=FixedLayout('cuda', torch.float32, size=[s0, 3, s2, s2], stride=[3*s2**2, s2**2, s2, 1]))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf1', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 64, 111, 111]), stride=[788544, 12321, 111, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf0, i3 + 111 * i2 + 12321 * i1 + 788544 * i0) + load(primals_2, i1)),
    ranges=torch.Size([2, 64, 111, 111]),
    origins={relu}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf2', layout=FixedLayout('cuda', torch.float32, size=[2, 64, 55, 55], stride=[193600, 3025, 55, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    maximum(load(buf1, 224 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 223 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 222 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 113 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 112 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 111 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 2 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 1 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), load(buf1, 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0))))))))),
    ranges=[2, 64, 55, 55],
    origins={max_pool2d_with_indices}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf3', layout=FlexibleLayout('cuda', torch.int64, size=[2, 64, 55, 55], stride=[193600, 3025, 55, 1]), data=Pointwise(
    'cuda',
    torch.int64,
    where(load(buf1, 224 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0) > maximum(load(buf1, 223 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 222 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 113 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 112 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 111 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 2 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 1 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), load(buf1, 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0)))))))), index_expr(224 + 2 * i3 + 222 * i2, torch.int64), where(load(buf1, 223 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0) > maximum(load(buf1, 222 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 113 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 112 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 111 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 2 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 1 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), load(buf1, 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0))))))), index_expr(223 + 2 * i3 + 222 * i2, torch.int64), where(load(buf1, 222 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0) > maximum(load(buf1, 113 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 112 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 111 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 2 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 1 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), load(buf1, 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0)))))), index_expr(222 + 2 * i3 + 222 * i2, torch.int64), where(load(buf1, 113 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0) > maximum(load(buf1, 112 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), 
maximum(load(buf1, 111 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 2 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 1 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), load(buf1, 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0))))), index_expr(113 + 2 * i3 + 222 * i2, torch.int64), where(load(buf1, 112 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0) > maximum(load(buf1, 111 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 2 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 1 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), load(buf1, 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0)))), index_expr(112 + 2 * i3 + 222 * i2, torch.int64), where(load(buf1, 111 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0) > maximum(load(buf1, 2 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 1 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), load(buf1, 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0))), index_expr(111 + 2 * i3 + 222 * i2, torch.int64), where(load(buf1, 2 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0) > maximum(load(buf1, 1 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), load(buf1, 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0)), index_expr(2 + 2 * i3 + 222 * i2, torch.int64), where(load(buf1, 1 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0) > load(buf1, 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), index_expr(1 + 2 * i3 + 222 * i2, torch.int64), index_expr(2 * i3 + 222 * i2, torch.int64))))))))),
    ranges=[2, 64, 55, 55],
    origins={max_pool2d_with_indices}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf5', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 16, 55, 55]), stride=[48400, 3025, 55, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf4, i3 + 55 * i2 + 3025 * i1 + 48400 * i0) + load(primals_4, i1)),
    ranges=torch.Size([2, 16, 55, 55]),
    origins={relu_1}
  ))
)), TensorBox(StorageBox(
  ConcatKernel(name='buf10', layout=FixedLayout('cuda', torch.float32, size=[2, 128, 55, 55], stride=[387200, 3025, 55, 1]), inputs=[ComputedBuffer(name='buf8', layout=AliasedLayout('cuda', torch.float32, size=[2, 64, 55, 55], stride=[387200, 3025, 55, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf6, i3 + 55 * i2 + 3025 * i1 + 193600 * i0) + load(primals_6, i1)),
    ranges=torch.Size([2, 64, 55, 55]),
    origins={relu_2}
  )), ComputedBuffer(name='buf9', layout=AliasedLayout('cuda', torch.float32, size=[2, 64, 55, 55], stride=[387200, 3025, 55, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf7, i3 + 55 * i2 + 3025 * i1 + 193600 * i0) + load(primals_8, i1)),
    ranges=torch.Size([2, 64, 55, 55]),
    origins={relu_3}
  ))])
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf12', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 16, 55, 55]), stride=[48400, 3025, 55, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf11, i3 + 55 * i2 + 3025 * i1 + 48400 * i0) + load(primals_10, i1)),
    ranges=torch.Size([2, 16, 55, 55]),
    origins={relu_4}
  ))
)), TensorBox(StorageBox(
  ConcatKernel(name='buf17', layout=FixedLayout('cuda', torch.float32, size=[2, 128, 55, 55], stride=[387200, 3025, 55, 1]), inputs=[ComputedBuffer(name='buf15', layout=AliasedLayout('cuda', torch.float32, size=[2, 64, 55, 55], stride=[387200, 3025, 55, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf13, i3 + 55 * i2 + 3025 * i1 + 193600 * i0) + load(primals_12, i1)),
    ranges=torch.Size([2, 64, 55, 55]),
    origins={relu_5}
  )), ComputedBuffer(name='buf16', layout=AliasedLayout('cuda', torch.float32, size=[2, 64, 55, 55], stride=[387200, 3025, 55, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf14, i3 + 55 * i2 + 3025 * i1 + 193600 * i0) + load(primals_14, i1)),
    ranges=torch.Size([2, 64, 55, 55]),
    origins={relu_6}
  ))])
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf18', layout=FixedLayout('cuda', torch.float32, size=[2, 128, 27, 27], stride=[93312, 729, 27, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    maximum(load(buf17, 112 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 111 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 110 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 57 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 56 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 55 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 2 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 1 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), load(buf17, 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0))))))))),
    ranges=[2, 128, 27, 27],
    origins={max_pool2d_with_indices_1}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf19', layout=FlexibleLayout('cuda', torch.int64, size=[2, 128, 27, 27], stride=[93312, 729, 27, 1]), data=Pointwise(
    'cuda',
    torch.int64,
    where(load(buf17, 112 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0) > maximum(load(buf17, 111 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 110 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 57 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 56 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 55 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 2 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 1 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), load(buf17, 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0)))))))), index_expr(112 + 2 * i3 + 110 * i2, torch.int64), where(load(buf17, 111 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0) > maximum(load(buf17, 110 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 57 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 56 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 55 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 2 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 1 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), load(buf17, 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0))))))), index_expr(111 + 2 * i3 + 110 * i2, torch.int64), where(load(buf17, 110 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0) > maximum(load(buf17, 57 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 56 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 55 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 2 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 1 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), load(buf17, 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0)))))), index_expr(110 + 2 * i3 + 110 * i2, torch.int64), where(load(buf17, 57 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0) > maximum(load(buf17, 56 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 55 + 
2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 2 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 1 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), load(buf17, 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0))))), index_expr(57 + 2 * i3 + 110 * i2, torch.int64), where(load(buf17, 56 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0) > maximum(load(buf17, 55 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 2 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 1 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), load(buf17, 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0)))), index_expr(56 + 2 * i3 + 110 * i2, torch.int64), where(load(buf17, 55 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0) > maximum(load(buf17, 2 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 1 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), load(buf17, 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0))), index_expr(55 + 2 * i3 + 110 * i2, torch.int64), where(load(buf17, 2 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0) > maximum(load(buf17, 1 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), load(buf17, 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0)), index_expr(2 + 2 * i3 + 110 * i2, torch.int64), where(load(buf17, 1 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0) > load(buf17, 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), index_expr(1 + 2 * i3 + 110 * i2, torch.int64), index_expr(2 * i3 + 110 * i2, torch.int64))))))))),
    ranges=[2, 128, 27, 27],
    origins={max_pool2d_with_indices_1}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf21', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 32, 27, 27]), stride=[23328, 729, 27, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf20, i3 + 27 * i2 + 729 * i1 + 23328 * i0) + load(primals_16, i1)),
    ranges=torch.Size([2, 32, 27, 27]),
    origins={relu_7}
  ))
)), TensorBox(StorageBox(
  ConcatKernel(name='buf26', layout=FixedLayout('cuda', torch.float32, size=[2, 256, 27, 27], stride=[186624, 729, 27, 1]), inputs=[ComputedBuffer(name='buf24', layout=AliasedLayout('cuda', torch.float32, size=[2, 128, 27, 27], stride=[186624, 729, 27, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf22, i3 + 27 * i2 + 729 * i1 + 93312 * i0) + load(primals_18, i1)),
    ranges=torch.Size([2, 128, 27, 27]),
    origins={relu_8}
  )), ComputedBuffer(name='buf25', layout=AliasedLayout('cuda', torch.float32, size=[2, 128, 27, 27], stride=[186624, 729, 27, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf23, i3 + 27 * i2 + 729 * i1 + 93312 * i0) + load(primals_20, i1)),
    ranges=torch.Size([2, 128, 27, 27]),
    origins={relu_9}
  ))])
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf28', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 32, 27, 27]), stride=[23328, 729, 27, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf27, i3 + 27 * i2 + 729 * i1 + 23328 * i0) + load(primals_22, i1)),
    ranges=torch.Size([2, 32, 27, 27]),
    origins={relu_10}
  ))
)), TensorBox(StorageBox(
  ConcatKernel(name='buf33', layout=FixedLayout('cuda', torch.float32, size=[2, 256, 27, 27], stride=[186624, 729, 27, 1]), inputs=[ComputedBuffer(name='buf31', layout=AliasedLayout('cuda', torch.float32, size=[2, 128, 27, 27], stride=[186624, 729, 27, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf29, i3 + 27 * i2 + 729 * i1 + 93312 * i0) + load(primals_24, i1)),
    ranges=torch.Size([2, 128, 27, 27]),
    origins={relu_11}
  )), ComputedBuffer(name='buf32', layout=AliasedLayout('cuda', torch.float32, size=[2, 128, 27, 27], stride=[186624, 729, 27, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf30, i3 + 27 * i2 + 729 * i1 + 93312 * i0) + load(primals_26, i1)),
    ranges=torch.Size([2, 128, 27, 27]),
    origins={relu_12}
  ))])
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf34', layout=FixedLayout('cuda', torch.float32, size=[2, 256, 13, 13], stride=[43264, 169, 13, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    maximum(load(buf33, 56 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 55 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 54 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 29 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 28 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 27 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 2 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 1 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), load(buf33, 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0))))))))),
    ranges=[2, 256, 13, 13],
    origins={max_pool2d_with_indices_2}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf35', layout=FlexibleLayout('cuda', torch.int64, size=[2, 256, 13, 13], stride=[43264, 169, 13, 1]), data=Pointwise(
    'cuda',
    torch.int64,
    where(load(buf33, 56 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0) > maximum(load(buf33, 55 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 54 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 29 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 28 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 27 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 2 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 1 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), load(buf33, 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0)))))))), index_expr(56 + 2 * i3 + 54 * i2, torch.int64), where(load(buf33, 55 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0) > maximum(load(buf33, 54 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 29 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 28 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 27 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 2 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 1 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), load(buf33, 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0))))))), index_expr(55 + 2 * i3 + 54 * i2, torch.int64), where(load(buf33, 54 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0) > maximum(load(buf33, 29 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 28 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 27 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 2 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 1 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), load(buf33, 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0)))))), index_expr(54 + 2 * i3 + 54 * i2, torch.int64), where(load(buf33, 29 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0) > maximum(load(buf33, 28 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 27 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 
2 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 1 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), load(buf33, 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0))))), index_expr(29 + 2 * i3 + 54 * i2, torch.int64), where(load(buf33, 28 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0) > maximum(load(buf33, 27 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 2 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 1 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), load(buf33, 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0)))), index_expr(28 + 2 * i3 + 54 * i2, torch.int64), where(load(buf33, 27 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0) > maximum(load(buf33, 2 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 1 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), load(buf33, 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0))), index_expr(27 + 2 * i3 + 54 * i2, torch.int64), where(load(buf33, 2 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0) > maximum(load(buf33, 1 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), load(buf33, 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0)), index_expr(2 + 2 * i3 + 54 * i2, torch.int64), where(load(buf33, 1 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0) > load(buf33, 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), index_expr(1 + 2 * i3 + 54 * i2, torch.int64), index_expr(2 * i3 + 54 * i2, torch.int64))))))))),
    ranges=[2, 256, 13, 13],
    origins={max_pool2d_with_indices_2}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf37', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 48, 13, 13]), stride=[8112, 169, 13, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf36, i3 + 13 * i2 + 169 * i1 + 8112 * i0) + load(primals_28, i1)),
    ranges=torch.Size([2, 48, 13, 13]),
    origins={relu_13}
  ))
)), TensorBox(StorageBox(
  ConcatKernel(name='buf42', layout=FixedLayout('cuda', torch.float32, size=[2, 384, 13, 13], stride=[64896, 169, 13, 1]), inputs=[ComputedBuffer(name='buf40', layout=AliasedLayout('cuda', torch.float32, size=[2, 192, 13, 13], stride=[64896, 169, 13, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf38, i3 + 13 * i2 + 169 * i1 + 32448 * i0) + load(primals_30, i1)),
    ranges=torch.Size([2, 192, 13, 13]),
    origins={relu_14}
  )), ComputedBuffer(name='buf41', layout=AliasedLayout('cuda', torch.float32, size=[2, 192, 13, 13], stride=[64896, 169, 13, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf39, i3 + 13 * i2 + 169 * i1 + 32448 * i0) + load(primals_32, i1)),
    ranges=torch.Size([2, 192, 13, 13]),
    origins={relu_15}
  ))])
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf44', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 48, 13, 13]), stride=[8112, 169, 13, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf43, i3 + 13 * i2 + 169 * i1 + 8112 * i0) + load(primals_34, i1)),
    ranges=torch.Size([2, 48, 13, 13]),
    origins={relu_16}
  ))
)), TensorBox(StorageBox(
  ConcatKernel(name='buf49', layout=FixedLayout('cuda', torch.float32, size=[2, 384, 13, 13], stride=[64896, 169, 13, 1]), inputs=[ComputedBuffer(name='buf47', layout=AliasedLayout('cuda', torch.float32, size=[2, 192, 13, 13], stride=[64896, 169, 13, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf45, i3 + 13 * i2 + 169 * i1 + 32448 * i0) + load(primals_36, i1)),
    ranges=torch.Size([2, 192, 13, 13]),
    origins={relu_17}
  )), ComputedBuffer(name='buf48', layout=AliasedLayout('cuda', torch.float32, size=[2, 192, 13, 13], stride=[64896, 169, 13, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf46, i3 + 13 * i2 + 169 * i1 + 32448 * i0) + load(primals_38, i1)),
    ranges=torch.Size([2, 192, 13, 13]),
    origins={relu_18}
  ))])
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf51', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 64, 13, 13]), stride=[10816, 169, 13, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf50, i3 + 13 * i2 + 169 * i1 + 10816 * i0) + load(primals_40, i1)),
    ranges=torch.Size([2, 64, 13, 13]),
    origins={relu_19}
  ))
)), TensorBox(StorageBox(
  ConcatKernel(name='buf56', layout=FixedLayout('cuda', torch.float32, size=[2, 512, 13, 13], stride=[86528, 169, 13, 1]), inputs=[ComputedBuffer(name='buf54', layout=AliasedLayout('cuda', torch.float32, size=[2, 256, 13, 13], stride=[86528, 169, 13, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf52, i3 + 13 * i2 + 169 * i1 + 43264 * i0) + load(primals_42, i1)),
    ranges=torch.Size([2, 256, 13, 13]),
    origins={relu_20}
  )), ComputedBuffer(name='buf55', layout=AliasedLayout('cuda', torch.float32, size=[2, 256, 13, 13], stride=[86528, 169, 13, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf53, i3 + 13 * i2 + 169 * i1 + 43264 * i0) + load(primals_44, i1)),
    ranges=torch.Size([2, 256, 13, 13]),
    origins={relu_21}
  ))])
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf58', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 64, 13, 13]), stride=[10816, 169, 13, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf57, i3 + 13 * i2 + 169 * i1 + 10816 * i0) + load(primals_46, i1)),
    ranges=torch.Size([2, 64, 13, 13]),
    origins={relu_22}
  ))
)), TensorBox(StorageBox(
  ConcatKernel(name='buf63', layout=FixedLayout('cuda', torch.float32, size=[2, 512, 13, 13], stride=[86528, 169, 13, 1]), inputs=[ComputedBuffer(name='buf61', layout=AliasedLayout('cuda', torch.float32, size=[2, 256, 13, 13], stride=[86528, 169, 13, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf59, i3 + 13 * i2 + 169 * i1 + 43264 * i0) + load(primals_48, i1)),
    ranges=torch.Size([2, 256, 13, 13]),
    origins={relu_23}
  )), ComputedBuffer(name='buf62', layout=AliasedLayout('cuda', torch.float32, size=[2, 256, 13, 13], stride=[86528, 169, 13, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf60, i3 + 13 * i2 + 169 * i1 + 43264 * i0) + load(primals_50, i1)),
    ranges=torch.Size([2, 256, 13, 13]),
    origins={relu_24}
  ))])
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    relu(load(buf64, i3 + 13 * i2 + 169 * i1 + 169000 * i0) + load(primals_52, i1)) <= constant(0, torch.float32),
    ranges=torch.Size([2, 1000, 13, 13]),
    origins={le}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf62, i3 + 13 * i2 + 169 * i1 + 86528 * i0) <= constant(0, torch.float32),
    ranges=[2, 256, 13, 13],
    origins={le_1}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf61, i3 + 13 * i2 + 169 * i1 + 86528 * i0) <= constant(0, torch.float32),
    ranges=[2, 256, 13, 13],
    origins={le_2}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf55, i3 + 13 * i2 + 169 * i1 + 86528 * i0) <= constant(0, torch.float32),
    ranges=[2, 256, 13, 13],
    origins={le_4}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf54, i3 + 13 * i2 + 169 * i1 + 86528 * i0) <= constant(0, torch.float32),
    ranges=[2, 256, 13, 13],
    origins={le_5}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf48, i3 + 13 * i2 + 169 * i1 + 64896 * i0) <= constant(0, torch.float32),
    ranges=[2, 192, 13, 13],
    origins={le_7}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf47, i3 + 13 * i2 + 169 * i1 + 64896 * i0) <= constant(0, torch.float32),
    ranges=[2, 192, 13, 13],
    origins={le_8}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf41, i3 + 13 * i2 + 169 * i1 + 64896 * i0) <= constant(0, torch.float32),
    ranges=[2, 192, 13, 13],
    origins={le_10}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf40, i3 + 13 * i2 + 169 * i1 + 64896 * i0) <= constant(0, torch.float32),
    ranges=[2, 192, 13, 13],
    origins={le_11}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf32, i3 + 27 * i2 + 729 * i1 + 186624 * i0) <= constant(0, torch.float32),
    ranges=[2, 128, 27, 27],
    origins={le_13}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf31, i3 + 27 * i2 + 729 * i1 + 186624 * i0) <= constant(0, torch.float32),
    ranges=[2, 128, 27, 27],
    origins={le_14}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf25, i3 + 27 * i2 + 729 * i1 + 186624 * i0) <= constant(0, torch.float32),
    ranges=[2, 128, 27, 27],
    origins={le_16}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf24, i3 + 27 * i2 + 729 * i1 + 186624 * i0) <= constant(0, torch.float32),
    ranges=[2, 128, 27, 27],
    origins={le_17}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf16, i3 + 55 * i2 + 3025 * i1 + 387200 * i0) <= constant(0, torch.float32),
    ranges=[2, 64, 55, 55],
    origins={le_19}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf15, i3 + 55 * i2 + 3025 * i1 + 387200 * i0) <= constant(0, torch.float32),
    ranges=[2, 64, 55, 55],
    origins={le_20}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf9, i3 + 55 * i2 + 3025 * i1 + 387200 * i0) <= constant(0, torch.float32),
    ranges=[2, 64, 55, 55],
    origins={le_22}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf8, i3 + 55 * i2 + 3025 * i1 + 387200 * i0) <= constant(0, torch.float32),
    ranges=[2, 64, 55, 55],
    origins={le_23}
  )
)), s0, 13, 13]

While executing return [view, primals_1, primals_3, primals_5, primals_7, primals_9, primals_11, primals_13, primals_15, primals_17, primals_19, primals_21, primals_23, primals_25, primals_27, primals_29, primals_31, primals_33, primals_35, primals_37, primals_39, primals_41, primals_43, primals_45, primals_47, primals_49, primals_51, primals_53, relu, getitem, getitem_1, relu_1, cat, relu_4, cat_1, getitem_2, getitem_3, relu_7, cat_2, relu_10, cat_3, getitem_4, getitem_5, relu_13, cat_4, relu_16, cat_5, relu_19, cat_6, relu_22, cat_7, le, le_1, le_2, le_4, le_5, le_7, le_8, le_10, le_11, le_13, le_14, le_16, le_17, le_19, le_20, le_22, le_23, sym_size, sym_size_1, sym_size_2]
Original traceback:
None
Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1122, in check_accuracy
    new_result = optimized_model_iter_fn(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1020, in run_n_iterations
    self.model_iter_fn(mod, inputs, collect_outputs=False)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 332, in forward_and_backward_pass
    cloned_inputs = clone_inputs(inputs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 335, in <graph break in forward_and_backward_pass>
    pred = mod(*cloned_inputs)
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/torchvision/torchvision/models/squeezenet.py", line 94, in forward
    def forward(self, x: torch.Tensor) -> torch.Tensor:
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 954, in forward
    return compiled_f(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 940, in new_func
    compiled_fn = create_aot_dispatcher_function(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 660, in create_aot_dispatcher_function
    aot_dispatch_autograd(flat_fn, fake_flat_tensor_args, aot_config)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 516, in aot_dispatch_autograd
    compiled_fw_func = aot_config.fw_compiler(fw_module, deduped_flat_args)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 87, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 351, in fw_compiler
    return compile_fx_inner(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/debug_utils.py", line 444, in debug_wrapper
    compiled_fn = compiler_fn(gm, example_inputs, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/debug.py", line 177, in inner
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/env/lib/python3.9/contextlib.py", line 79, in inner
    return func(*args, **kwds)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/compile_fx.py", line 121, in compile_fx_inner
    graph.run(*example_inputs)
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/utils.py", line 87, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 129, in run
    return super().run(*args)
  File "/scratch/ezyang/work/pytorch/torch/fx/interpreter.py", line 130, in run
    self.env[node] = self.run_node(node)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 299, in run_node
    result = super().run_node(n)
  File "/scratch/ezyang/work/pytorch/torch/fx/interpreter.py", line 171, in run_node
    return getattr(self, n.op)(n.target, args, kwargs)
  File "/scratch/ezyang/work/pytorch/torch/_inductor/graph.py", line 267, in output
    assert all(
AssertionError: [TensorBox(
  View(
    StorageBox(
      Pointwise(
        'cuda',
        torch.float32,
        load(buf65, i1 + 1000 * i0) / index_expr(169, torch.float32),
        ranges=[2, 1000, 1, 1],
        origins={mean}
      )
    ),
    size=(2, 1000),
    reindex=lambda i0, i1: [i0, i1, 0, 0],
    origins={view}
  )
), TensorBox(StorageBox(
  InputBuffer(name='primals_1', layout=FixedLayout('cuda', torch.float32, size=[64, 3, 3, 3], stride=[27, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_3', layout=FixedLayout('cuda', torch.float32, size=[16, 64, 1, 1], stride=[64, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_5', layout=FixedLayout('cuda', torch.float32, size=[64, 16, 1, 1], stride=[16, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_7', layout=FixedLayout('cuda', torch.float32, size=[64, 16, 3, 3], stride=[144, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_9', layout=FixedLayout('cuda', torch.float32, size=[16, 128, 1, 1], stride=[128, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_11', layout=FixedLayout('cuda', torch.float32, size=[64, 16, 1, 1], stride=[16, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_13', layout=FixedLayout('cuda', torch.float32, size=[64, 16, 3, 3], stride=[144, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_15', layout=FixedLayout('cuda', torch.float32, size=[32, 128, 1, 1], stride=[128, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_17', layout=FixedLayout('cuda', torch.float32, size=[128, 32, 1, 1], stride=[32, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_19', layout=FixedLayout('cuda', torch.float32, size=[128, 32, 3, 3], stride=[288, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_21', layout=FixedLayout('cuda', torch.float32, size=[32, 256, 1, 1], stride=[256, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_23', layout=FixedLayout('cuda', torch.float32, size=[128, 32, 1, 1], stride=[32, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_25', layout=FixedLayout('cuda', torch.float32, size=[128, 32, 3, 3], stride=[288, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_27', layout=FixedLayout('cuda', torch.float32, size=[48, 256, 1, 1], stride=[256, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_29', layout=FixedLayout('cuda', torch.float32, size=[192, 48, 1, 1], stride=[48, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_31', layout=FixedLayout('cuda', torch.float32, size=[192, 48, 3, 3], stride=[432, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_33', layout=FixedLayout('cuda', torch.float32, size=[48, 384, 1, 1], stride=[384, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_35', layout=FixedLayout('cuda', torch.float32, size=[192, 48, 1, 1], stride=[48, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_37', layout=FixedLayout('cuda', torch.float32, size=[192, 48, 3, 3], stride=[432, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_39', layout=FixedLayout('cuda', torch.float32, size=[64, 384, 1, 1], stride=[384, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_41', layout=FixedLayout('cuda', torch.float32, size=[256, 64, 1, 1], stride=[64, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_43', layout=FixedLayout('cuda', torch.float32, size=[256, 64, 3, 3], stride=[576, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_45', layout=FixedLayout('cuda', torch.float32, size=[64, 512, 1, 1], stride=[512, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_47', layout=FixedLayout('cuda', torch.float32, size=[256, 64, 1, 1], stride=[64, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_49', layout=FixedLayout('cuda', torch.float32, size=[256, 64, 3, 3], stride=[576, 9, 3, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_51', layout=FixedLayout('cuda', torch.float32, size=[1000, 512, 1, 1], stride=[512, 1, 1, 1]))
)), TensorBox(StorageBox(
  InputBuffer(name='primals_53', layout=FixedLayout('cuda', torch.float32, size=[s0, 3, s2, s2], stride=[3*s2**2, s2**2, s2, 1]))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf1', layout=FlexibleLayout('cuda', torch.float32, size=torch.Size([2, 64, 111, 111]), stride=[788544, 12321, 111, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf0, i3 + 111 * i2 + 12321 * i1 + 788544 * i0) + load(primals_2, i1)),
    ranges=torch.Size([2, 64, 111, 111]),
    origins={relu}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf2', layout=FixedLayout('cuda', torch.float32, size=[2, 64, 55, 55], stride=[193600, 3025, 55, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    maximum(load(buf1, 224 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 223 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 222 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 113 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 112 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 111 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 2 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 1 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), load(buf1, 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0))))))))),
    ranges=[2, 64, 55, 55],
    origins={max_pool2d_with_indices}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf3', layout=FlexibleLayout('cuda', torch.int64, size=[2, 64, 55, 55], stride=[193600, 3025, 55, 1]), data=Pointwise(
    'cuda',
    torch.int64,
    where(load(buf1, 224 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0) > maximum(load(buf1, 223 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 222 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 113 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 112 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 111 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 2 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 1 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), load(buf1, 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0)))))))), index_expr(224 + 2 * i3 + 222 * i2, torch.int64), where(load(buf1, 223 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0) > maximum(load(buf1, 222 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 113 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 112 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 111 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 2 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 1 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), load(buf1, 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0))))))), index_expr(223 + 2 * i3 + 222 * i2, torch.int64), where(load(buf1, 222 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0) > maximum(load(buf1, 113 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 112 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 111 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 2 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 1 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), load(buf1, 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0)))))), index_expr(222 + 2 * i3 + 222 * i2, torch.int64), where(load(buf1, 113 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0) > maximum(load(buf1, 112 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), 
maximum(load(buf1, 111 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 2 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 1 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), load(buf1, 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0))))), index_expr(113 + 2 * i3 + 222 * i2, torch.int64), where(load(buf1, 112 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0) > maximum(load(buf1, 111 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 2 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 1 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), load(buf1, 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0)))), index_expr(112 + 2 * i3 + 222 * i2, torch.int64), where(load(buf1, 111 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0) > maximum(load(buf1, 2 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), maximum(load(buf1, 1 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), load(buf1, 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0))), index_expr(111 + 2 * i3 + 222 * i2, torch.int64), where(load(buf1, 2 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0) > maximum(load(buf1, 1 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), load(buf1, 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0)), index_expr(2 + 2 * i3 + 222 * i2, torch.int64), where(load(buf1, 1 + 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0) > load(buf1, 2 * i3 + 222 * i2 + 12321 * i1 + 788544 * i0), index_expr(1 + 2 * i3 + 222 * i2, torch.int64), index_expr(2 * i3 + 222 * i2, torch.int64))))))))),
    ranges=[2, 64, 55, 55],
    origins={max_pool2d_with_indices}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf5', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 16, 55, 55]), stride=[48400, 3025, 55, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf4, i3 + 55 * i2 + 3025 * i1 + 48400 * i0) + load(primals_4, i1)),
    ranges=torch.Size([2, 16, 55, 55]),
    origins={relu_1}
  ))
)), TensorBox(StorageBox(
  ConcatKernel(name='buf10', layout=FixedLayout('cuda', torch.float32, size=[2, 128, 55, 55], stride=[387200, 3025, 55, 1]), inputs=[ComputedBuffer(name='buf8', layout=AliasedLayout('cuda', torch.float32, size=[2, 64, 55, 55], stride=[387200, 3025, 55, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf6, i3 + 55 * i2 + 3025 * i1 + 193600 * i0) + load(primals_6, i1)),
    ranges=torch.Size([2, 64, 55, 55]),
    origins={relu_2}
  )), ComputedBuffer(name='buf9', layout=AliasedLayout('cuda', torch.float32, size=[2, 64, 55, 55], stride=[387200, 3025, 55, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf7, i3 + 55 * i2 + 3025 * i1 + 193600 * i0) + load(primals_8, i1)),
    ranges=torch.Size([2, 64, 55, 55]),
    origins={relu_3}
  ))])
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf12', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 16, 55, 55]), stride=[48400, 3025, 55, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf11, i3 + 55 * i2 + 3025 * i1 + 48400 * i0) + load(primals_10, i1)),
    ranges=torch.Size([2, 16, 55, 55]),
    origins={relu_4}
  ))
)), TensorBox(StorageBox(
  ConcatKernel(name='buf17', layout=FixedLayout('cuda', torch.float32, size=[2, 128, 55, 55], stride=[387200, 3025, 55, 1]), inputs=[ComputedBuffer(name='buf15', layout=AliasedLayout('cuda', torch.float32, size=[2, 64, 55, 55], stride=[387200, 3025, 55, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf13, i3 + 55 * i2 + 3025 * i1 + 193600 * i0) + load(primals_12, i1)),
    ranges=torch.Size([2, 64, 55, 55]),
    origins={relu_5}
  )), ComputedBuffer(name='buf16', layout=AliasedLayout('cuda', torch.float32, size=[2, 64, 55, 55], stride=[387200, 3025, 55, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf14, i3 + 55 * i2 + 3025 * i1 + 193600 * i0) + load(primals_14, i1)),
    ranges=torch.Size([2, 64, 55, 55]),
    origins={relu_6}
  ))])
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf18', layout=FixedLayout('cuda', torch.float32, size=[2, 128, 27, 27], stride=[93312, 729, 27, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    maximum(load(buf17, 112 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 111 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 110 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 57 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 56 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 55 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 2 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 1 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), load(buf17, 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0))))))))),
    ranges=[2, 128, 27, 27],
    origins={max_pool2d_with_indices_1}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf19', layout=FlexibleLayout('cuda', torch.int64, size=[2, 128, 27, 27], stride=[93312, 729, 27, 1]), data=Pointwise(
    'cuda',
    torch.int64,
    where(load(buf17, 112 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0) > maximum(load(buf17, 111 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 110 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 57 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 56 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 55 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 2 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 1 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), load(buf17, 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0)))))))), index_expr(112 + 2 * i3 + 110 * i2, torch.int64), where(load(buf17, 111 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0) > maximum(load(buf17, 110 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 57 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 56 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 55 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 2 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 1 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), load(buf17, 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0))))))), index_expr(111 + 2 * i3 + 110 * i2, torch.int64), where(load(buf17, 110 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0) > maximum(load(buf17, 57 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 56 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 55 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 2 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 1 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), load(buf17, 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0)))))), index_expr(110 + 2 * i3 + 110 * i2, torch.int64), where(load(buf17, 57 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0) > maximum(load(buf17, 56 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 55 + 
2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 2 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 1 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), load(buf17, 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0))))), index_expr(57 + 2 * i3 + 110 * i2, torch.int64), where(load(buf17, 56 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0) > maximum(load(buf17, 55 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 2 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 1 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), load(buf17, 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0)))), index_expr(56 + 2 * i3 + 110 * i2, torch.int64), where(load(buf17, 55 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0) > maximum(load(buf17, 2 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), maximum(load(buf17, 1 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), load(buf17, 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0))), index_expr(55 + 2 * i3 + 110 * i2, torch.int64), where(load(buf17, 2 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0) > maximum(load(buf17, 1 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), load(buf17, 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0)), index_expr(2 + 2 * i3 + 110 * i2, torch.int64), where(load(buf17, 1 + 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0) > load(buf17, 2 * i3 + 110 * i2 + 3025 * i1 + 387200 * i0), index_expr(1 + 2 * i3 + 110 * i2, torch.int64), index_expr(2 * i3 + 110 * i2, torch.int64))))))))),
    ranges=[2, 128, 27, 27],
    origins={max_pool2d_with_indices_1}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf21', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 32, 27, 27]), stride=[23328, 729, 27, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf20, i3 + 27 * i2 + 729 * i1 + 23328 * i0) + load(primals_16, i1)),
    ranges=torch.Size([2, 32, 27, 27]),
    origins={relu_7}
  ))
)), TensorBox(StorageBox(
  ConcatKernel(name='buf26', layout=FixedLayout('cuda', torch.float32, size=[2, 256, 27, 27], stride=[186624, 729, 27, 1]), inputs=[ComputedBuffer(name='buf24', layout=AliasedLayout('cuda', torch.float32, size=[2, 128, 27, 27], stride=[186624, 729, 27, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf22, i3 + 27 * i2 + 729 * i1 + 93312 * i0) + load(primals_18, i1)),
    ranges=torch.Size([2, 128, 27, 27]),
    origins={relu_8}
  )), ComputedBuffer(name='buf25', layout=AliasedLayout('cuda', torch.float32, size=[2, 128, 27, 27], stride=[186624, 729, 27, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf23, i3 + 27 * i2 + 729 * i1 + 93312 * i0) + load(primals_20, i1)),
    ranges=torch.Size([2, 128, 27, 27]),
    origins={relu_9}
  ))])
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf28', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 32, 27, 27]), stride=[23328, 729, 27, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf27, i3 + 27 * i2 + 729 * i1 + 23328 * i0) + load(primals_22, i1)),
    ranges=torch.Size([2, 32, 27, 27]),
    origins={relu_10}
  ))
)), TensorBox(StorageBox(
  ConcatKernel(name='buf33', layout=FixedLayout('cuda', torch.float32, size=[2, 256, 27, 27], stride=[186624, 729, 27, 1]), inputs=[ComputedBuffer(name='buf31', layout=AliasedLayout('cuda', torch.float32, size=[2, 128, 27, 27], stride=[186624, 729, 27, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf29, i3 + 27 * i2 + 729 * i1 + 93312 * i0) + load(primals_24, i1)),
    ranges=torch.Size([2, 128, 27, 27]),
    origins={relu_11}
  )), ComputedBuffer(name='buf32', layout=AliasedLayout('cuda', torch.float32, size=[2, 128, 27, 27], stride=[186624, 729, 27, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf30, i3 + 27 * i2 + 729 * i1 + 93312 * i0) + load(primals_26, i1)),
    ranges=torch.Size([2, 128, 27, 27]),
    origins={relu_12}
  ))])
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf34', layout=FixedLayout('cuda', torch.float32, size=[2, 256, 13, 13], stride=[43264, 169, 13, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    maximum(load(buf33, 56 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 55 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 54 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 29 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 28 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 27 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 2 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 1 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), load(buf33, 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0))))))))),
    ranges=[2, 256, 13, 13],
    origins={max_pool2d_with_indices_2}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf35', layout=FlexibleLayout('cuda', torch.int64, size=[2, 256, 13, 13], stride=[43264, 169, 13, 1]), data=Pointwise(
    'cuda',
    torch.int64,
    where(load(buf33, 56 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0) > maximum(load(buf33, 55 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 54 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 29 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 28 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 27 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 2 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 1 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), load(buf33, 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0)))))))), index_expr(56 + 2 * i3 + 54 * i2, torch.int64), where(load(buf33, 55 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0) > maximum(load(buf33, 54 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 29 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 28 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 27 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 2 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 1 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), load(buf33, 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0))))))), index_expr(55 + 2 * i3 + 54 * i2, torch.int64), where(load(buf33, 54 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0) > maximum(load(buf33, 29 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 28 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 27 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 2 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 1 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), load(buf33, 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0)))))), index_expr(54 + 2 * i3 + 54 * i2, torch.int64), where(load(buf33, 29 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0) > maximum(load(buf33, 28 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 27 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 
2 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 1 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), load(buf33, 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0))))), index_expr(29 + 2 * i3 + 54 * i2, torch.int64), where(load(buf33, 28 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0) > maximum(load(buf33, 27 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 2 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 1 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), load(buf33, 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0)))), index_expr(28 + 2 * i3 + 54 * i2, torch.int64), where(load(buf33, 27 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0) > maximum(load(buf33, 2 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), maximum(load(buf33, 1 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), load(buf33, 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0))), index_expr(27 + 2 * i3 + 54 * i2, torch.int64), where(load(buf33, 2 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0) > maximum(load(buf33, 1 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), load(buf33, 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0)), index_expr(2 + 2 * i3 + 54 * i2, torch.int64), where(load(buf33, 1 + 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0) > load(buf33, 2 * i3 + 54 * i2 + 729 * i1 + 186624 * i0), index_expr(1 + 2 * i3 + 54 * i2, torch.int64), index_expr(2 * i3 + 54 * i2, torch.int64))))))))),
    ranges=[2, 256, 13, 13],
    origins={max_pool2d_with_indices_2}
  ))
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf37', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 48, 13, 13]), stride=[8112, 169, 13, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf36, i3 + 13 * i2 + 169 * i1 + 8112 * i0) + load(primals_28, i1)),
    ranges=torch.Size([2, 48, 13, 13]),
    origins={relu_13}
  ))
)), TensorBox(StorageBox(
  ConcatKernel(name='buf42', layout=FixedLayout('cuda', torch.float32, size=[2, 384, 13, 13], stride=[64896, 169, 13, 1]), inputs=[ComputedBuffer(name='buf40', layout=AliasedLayout('cuda', torch.float32, size=[2, 192, 13, 13], stride=[64896, 169, 13, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf38, i3 + 13 * i2 + 169 * i1 + 32448 * i0) + load(primals_30, i1)),
    ranges=torch.Size([2, 192, 13, 13]),
    origins={relu_14}
  )), ComputedBuffer(name='buf41', layout=AliasedLayout('cuda', torch.float32, size=[2, 192, 13, 13], stride=[64896, 169, 13, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf39, i3 + 13 * i2 + 169 * i1 + 32448 * i0) + load(primals_32, i1)),
    ranges=torch.Size([2, 192, 13, 13]),
    origins={relu_15}
  ))])
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf44', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 48, 13, 13]), stride=[8112, 169, 13, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf43, i3 + 13 * i2 + 169 * i1 + 8112 * i0) + load(primals_34, i1)),
    ranges=torch.Size([2, 48, 13, 13]),
    origins={relu_16}
  ))
)), TensorBox(StorageBox(
  ConcatKernel(name='buf49', layout=FixedLayout('cuda', torch.float32, size=[2, 384, 13, 13], stride=[64896, 169, 13, 1]), inputs=[ComputedBuffer(name='buf47', layout=AliasedLayout('cuda', torch.float32, size=[2, 192, 13, 13], stride=[64896, 169, 13, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf45, i3 + 13 * i2 + 169 * i1 + 32448 * i0) + load(primals_36, i1)),
    ranges=torch.Size([2, 192, 13, 13]),
    origins={relu_17}
  )), ComputedBuffer(name='buf48', layout=AliasedLayout('cuda', torch.float32, size=[2, 192, 13, 13], stride=[64896, 169, 13, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf46, i3 + 13 * i2 + 169 * i1 + 32448 * i0) + load(primals_38, i1)),
    ranges=torch.Size([2, 192, 13, 13]),
    origins={relu_18}
  ))])
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf51', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 64, 13, 13]), stride=[10816, 169, 13, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf50, i3 + 13 * i2 + 169 * i1 + 10816 * i0) + load(primals_40, i1)),
    ranges=torch.Size([2, 64, 13, 13]),
    origins={relu_19}
  ))
)), TensorBox(StorageBox(
  ConcatKernel(name='buf56', layout=FixedLayout('cuda', torch.float32, size=[2, 512, 13, 13], stride=[86528, 169, 13, 1]), inputs=[ComputedBuffer(name='buf54', layout=AliasedLayout('cuda', torch.float32, size=[2, 256, 13, 13], stride=[86528, 169, 13, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf52, i3 + 13 * i2 + 169 * i1 + 43264 * i0) + load(primals_42, i1)),
    ranges=torch.Size([2, 256, 13, 13]),
    origins={relu_20}
  )), ComputedBuffer(name='buf55', layout=AliasedLayout('cuda', torch.float32, size=[2, 256, 13, 13], stride=[86528, 169, 13, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf53, i3 + 13 * i2 + 169 * i1 + 43264 * i0) + load(primals_44, i1)),
    ranges=torch.Size([2, 256, 13, 13]),
    origins={relu_21}
  ))])
)), TensorBox(StorageBox(
  ComputedBuffer(name='buf58', layout=FixedLayout('cuda', torch.float32, size=torch.Size([2, 64, 13, 13]), stride=[10816, 169, 13, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf57, i3 + 13 * i2 + 169 * i1 + 10816 * i0) + load(primals_46, i1)),
    ranges=torch.Size([2, 64, 13, 13]),
    origins={relu_22}
  ))
)), TensorBox(StorageBox(
  ConcatKernel(name='buf63', layout=FixedLayout('cuda', torch.float32, size=[2, 512, 13, 13], stride=[86528, 169, 13, 1]), inputs=[ComputedBuffer(name='buf61', layout=AliasedLayout('cuda', torch.float32, size=[2, 256, 13, 13], stride=[86528, 169, 13, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf59, i3 + 13 * i2 + 169 * i1 + 43264 * i0) + load(primals_48, i1)),
    ranges=torch.Size([2, 256, 13, 13]),
    origins={relu_23}
  )), ComputedBuffer(name='buf62', layout=AliasedLayout('cuda', torch.float32, size=[2, 256, 13, 13], stride=[86528, 169, 13, 1]), data=Pointwise(
    'cuda',
    torch.float32,
    relu(load(buf60, i3 + 13 * i2 + 169 * i1 + 43264 * i0) + load(primals_50, i1)),
    ranges=torch.Size([2, 256, 13, 13]),
    origins={relu_24}
  ))])
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    relu(load(buf64, i3 + 13 * i2 + 169 * i1 + 169000 * i0) + load(primals_52, i1)) <= constant(0, torch.float32),
    ranges=torch.Size([2, 1000, 13, 13]),
    origins={le}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf62, i3 + 13 * i2 + 169 * i1 + 86528 * i0) <= constant(0, torch.float32),
    ranges=[2, 256, 13, 13],
    origins={le_1}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf61, i3 + 13 * i2 + 169 * i1 + 86528 * i0) <= constant(0, torch.float32),
    ranges=[2, 256, 13, 13],
    origins={le_2}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf55, i3 + 13 * i2 + 169 * i1 + 86528 * i0) <= constant(0, torch.float32),
    ranges=[2, 256, 13, 13],
    origins={le_4}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf54, i3 + 13 * i2 + 169 * i1 + 86528 * i0) <= constant(0, torch.float32),
    ranges=[2, 256, 13, 13],
    origins={le_5}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf48, i3 + 13 * i2 + 169 * i1 + 64896 * i0) <= constant(0, torch.float32),
    ranges=[2, 192, 13, 13],
    origins={le_7}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf47, i3 + 13 * i2 + 169 * i1 + 64896 * i0) <= constant(0, torch.float32),
    ranges=[2, 192, 13, 13],
    origins={le_8}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf41, i3 + 13 * i2 + 169 * i1 + 64896 * i0) <= constant(0, torch.float32),
    ranges=[2, 192, 13, 13],
    origins={le_10}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf40, i3 + 13 * i2 + 169 * i1 + 64896 * i0) <= constant(0, torch.float32),
    ranges=[2, 192, 13, 13],
    origins={le_11}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf32, i3 + 27 * i2 + 729 * i1 + 186624 * i0) <= constant(0, torch.float32),
    ranges=[2, 128, 27, 27],
    origins={le_13}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf31, i3 + 27 * i2 + 729 * i1 + 186624 * i0) <= constant(0, torch.float32),
    ranges=[2, 128, 27, 27],
    origins={le_14}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf25, i3 + 27 * i2 + 729 * i1 + 186624 * i0) <= constant(0, torch.float32),
    ranges=[2, 128, 27, 27],
    origins={le_16}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf24, i3 + 27 * i2 + 729 * i1 + 186624 * i0) <= constant(0, torch.float32),
    ranges=[2, 128, 27, 27],
    origins={le_17}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf16, i3 + 55 * i2 + 3025 * i1 + 387200 * i0) <= constant(0, torch.float32),
    ranges=[2, 64, 55, 55],
    origins={le_19}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf15, i3 + 55 * i2 + 3025 * i1 + 387200 * i0) <= constant(0, torch.float32),
    ranges=[2, 64, 55, 55],
    origins={le_20}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf9, i3 + 55 * i2 + 3025 * i1 + 387200 * i0) <= constant(0, torch.float32),
    ranges=[2, 64, 55, 55],
    origins={le_22}
  )
)), TensorBox(StorageBox(
  Pointwise(
    'cuda',
    torch.bool,
    load(buf8, i3 + 55 * i2 + 3025 * i1 + 387200 * i0) <= constant(0, torch.float32),
    ranges=[2, 64, 55, 55],
    origins={le_23}
  )
)), s0, 13, 13]

While executing return [view, primals_1, primals_3, primals_5, primals_7, primals_9, primals_11, primals_13, primals_15, primals_17, primals_19, primals_21, primals_23, primals_25, primals_27, primals_29, primals_31, primals_33, primals_35, primals_37, primals_39, primals_41, primals_43, primals_45, primals_47, primals_49, primals_51, primals_53, relu, getitem, getitem_1, relu_1, cat, relu_4, cat_1, getitem_2, getitem_3, relu_7, cat_2, relu_10, cat_3, getitem_4, getitem_5, relu_13, cat_4, relu_16, cat_5, relu_19, cat_6, relu_22, cat_7, le, le_1, le_2, le_4, le_5, le_7, le_8, le_10, le_11, le_13, le_14, le_16, le_17, le_19, le_20, le_22, le_23, sym_size, sym_size_1, sym_size_2]
Original traceback:
None
TorchDynamo optimized model failed to run because of following error
cuda train squeezenet1_1                      FAIL
Running torchbench.py tacotron2...
ERROR:common:Cannot call sizes() on tensor with symbolic sizes/strides

While executing %lowmem_dropout_2 : [#users=1] = call_function[target=torch._inductor.overrides.lowmem_dropout](args = (%relu, 0.5, True), kwargs = {})
Original traceback:
None
Traceback (most recent call last):
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1122, in check_accuracy
    new_result = optimized_model_iter_fn(
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/common.py", line 1020, in run_n_iterations
    self.model_iter_fn(mod, inputs, collect_outputs=False)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 332, in forward_and_backward_pass
    cloned_inputs = clone_inputs(inputs)
  File "/scratch/ezyang/work/pytorch/benchmarks/dynamo/torchbench.py", line 335, in <graph break in forward_and_backward_pass>
    pred = mod(*cloned_inputs)
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/torchbenchmark/torchbenchmark/models/tacotron2/model.py", line 505, in forward
    encoder_outputs = self.encoder(embedded_inputs, text_lengths)
  File "/scratch/ezyang/work/pytorch/torch/nn/modules/module.py", line 1423, in _call_impl
    return forward_call(*input, **kwargs)
  File "/scratch/ezyang/work/torchbenchmark/torchbenchmark/models/tacotron2/model.py", line 173, in forward
    def forward(self, x, input_lengths):
  File "/scratch/ezyang/work/pytorch/torch/_dynamo/eval_frame.py", line 173, in _fn
    return fn(*args, **kwargs)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 954, in forward
    return compiled_f(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 940, in new_func
    compiled_fn = create_aot_dispatcher_function(
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 660, in create_aot_dispatcher_function
    aot_dispatch_autograd(flat_fn, fake_flat_tensor_args, aot_config)
  File "/scratch/ezyang/work/pytorch/functorch/_src/aot_autograd.py", line 462, in aot_dispatch_autograd
    out = flat_fn(*flat_args)
  File "/