leslie-fang-intel/gist:f8b9df5aefdf72f2111d5237fb178ff0

## gistfile1.txt
from ctypes import c_void_p, c_long
import torch
import math
import random
from torch import empty_strided, as_strided, device
from torch._inductor.codecache import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels

aten = torch.ops.aten
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
async_compile = AsyncCompile()


kernel_cpp_0 = async_compile.cpp('''
#include <ATen/record_function.h>
#include "/tmp/torchinductor_leslie/zt/cztcl2vp5yqlnhofzpqfficjcxgyict6e3xhfdd7sdbkipp4p44x.h"
extern "C" void kernel(const float* __restrict__ in_ptr0,
                       const float* __restrict__ in_ptr1,
                       const long* __restrict__ in_ptr2,
                       unsigned char* __restrict__ out_ptr0)
{
    RECORD_FUNCTION("graph_0_kernel_cpp_0", c10::ArrayRef<c10::IValue>({}));
    #pragma omp parallel num_threads(56)
    {
        {
            #pragma omp for
            for(long i0=0; i0<17461248; i0+=1)
            {
                auto tmp2 = in_ptr0[i0];
                auto tmp3 = in_ptr1[0];
                auto tmp6 = in_ptr2[0];
                auto tmp0 = static_cast<float>(0);
                auto tmp1 = static_cast<float>(127);
                auto tmp4 = tmp2 / tmp3;
                auto tmp5 = std::nearbyint(tmp4);
                auto tmp7 = static_cast<float>(tmp6);
                auto tmp8 = tmp5 + tmp7;
                auto tmp9 = (tmp8 != tmp8) ? tmp8 : std::min(tmp1, tmp8);
                auto tmp10 = (tmp9 != tmp9) ? tmp9 : std::max(tmp0, tmp9);
                auto tmp11 = static_cast<unsigned char>(tmp10);
                out_ptr0[i0] = tmp11;
            }
        }
    }
}
''')


kernel_cpp_1 = async_compile.cpp('''
#include <ATen/record_function.h>
#include "/tmp/torchinductor_leslie/zt/cztcl2vp5yqlnhofzpqfficjcxgyict6e3xhfdd7sdbkipp4p44x.h"
extern "C" void kernel(const unsigned char* __restrict__ in_ptr0,
                       const long* __restrict__ in_ptr1,
                       const float* __restrict__ in_ptr2,
                       const float* __restrict__ in_ptr3,
                       const long* __restrict__ in_ptr4,
                       float* __restrict__ out_ptr0,
                       float* __restrict__ out_ptr1,
                       unsigned char* __restrict__ out_ptr2)
{
    RECORD_FUNCTION("graph_0_kernel_cpp_1", c10::ArrayRef<c10::IValue>({}));
    #pragma omp parallel num_threads(56)
    {
        {
            #pragma omp for
            for(long i0=0; i0<93126656; i0+=1)
            {
                auto tmp0 = in_ptr0[i0];
                auto tmp2 = in_ptr1[0];
                auto tmp5 = in_ptr2[0];
                auto tmp1 = static_cast<float>(tmp0);
                auto tmp3 = static_cast<float>(tmp2);
                auto tmp4 = tmp1 - tmp3;
                auto tmp6 = tmp4 * tmp5;
                out_ptr0[i0] = tmp6;
            }
        }
        {
            #pragma omp for
            for(long i0=0; i0<116; i0+=1)
            {
                #pragma GCC ivdep
                for(long i1=0; i1<64; i1+=1)
                {
                    #pragma GCC ivdep
                    for(long i2=0; i2<56; i2+=1)
                    {
                        #pragma GCC ivdep
                        for(long i3=0; i3<56; i3+=1)
                        {
                            auto tmp0 = static_cast<long>((-1) + (2*i2));
                            auto tmp1 = static_cast<long>(0);
                            auto tmp2 = tmp0 >= tmp1;
                            auto tmp3 = static_cast<long>(112);
                            auto tmp4 = tmp0 < tmp3;
                            auto tmp5 = tmp2 & tmp4;
                            auto tmp6 = static_cast<long>((-1) + (2*i3));
                            auto tmp7 = tmp6 >= tmp1;
                            auto tmp8 = tmp6 < tmp3;
                            auto tmp9 = tmp7 & tmp8;
                            auto tmp10 = tmp5 & tmp9;
                            auto tmp11 = [&]
                            {
                                auto tmp12 = out_ptr0[(-7232) + i1 + (128*i3) + (14336*i2) + (802816*i0)];
                                return tmp12;
                            }
                            ;
                            auto tmp13 = tmp10 ? tmp11() : -std::numeric_limits<decltype(tmp11())>::infinity();
                            auto tmp14 = static_cast<long>(2*i3);
                            auto tmp15 = tmp14 >= tmp1;
                            auto tmp16 = tmp14 < tmp3;
                            auto tmp17 = tmp15 & tmp16;
                            auto tmp18 = tmp5 & tmp17;
                            auto tmp19 = [&]
                            {
                                auto tmp20 = out_ptr0[(-7168) + i1 + (128*i3) + (14336*i2) + (802816*i0)];
                                return tmp20;
                            }
                            ;
                            auto tmp21 = tmp18 ? tmp19() : -std::numeric_limits<decltype(tmp19())>::infinity();
                            auto tmp22 = (tmp13 != tmp13) ? tmp13 : std::max(tmp21, tmp13);
                            auto tmp23 = static_cast<long>(1 + (2*i3));
                            auto tmp24 = tmp23 >= tmp1;
                            auto tmp25 = tmp23 < tmp3;
                            auto tmp26 = tmp24 & tmp25;
                            auto tmp27 = tmp5 & tmp26;
                            auto tmp28 = [&]
                            {
                                auto tmp29 = out_ptr0[(-7104) + i1 + (128*i3) + (14336*i2) + (802816*i0)];
                                return tmp29;
                            }
                            ;
                            auto tmp30 = tmp27 ? tmp28() : -std::numeric_limits<decltype(tmp28())>::infinity();
                            auto tmp31 = (tmp22 != tmp22) ? tmp22 : std::max(tmp30, tmp22);
                            auto tmp32 = static_cast<long>(2*i2);
                            auto tmp33 = tmp32 >= tmp1;
                            auto tmp34 = tmp32 < tmp3;
                            auto tmp35 = tmp33 & tmp34;
                            auto tmp36 = tmp35 & tmp9;
                            auto tmp37 = [&]
                            {
                                auto tmp38 = out_ptr0[(-64) + i1 + (128*i3) + (14336*i2) + (802816*i0)];
                                return tmp38;
                            }
                            ;
                            auto tmp39 = tmp36 ? tmp37() : -std::numeric_limits<decltype(tmp37())>::infinity();
                            auto tmp40 = (tmp31 != tmp31) ? tmp31 : std::max(tmp39, tmp31);
                            auto tmp41 = tmp35 & tmp17;
                            auto tmp42 = [&]
                            {
                                auto tmp43 = out_ptr0[i1 + (128*i3) + (14336*i2) + (802816*i0)];
                                return tmp43;
                            }
                            ;
                            auto tmp44 = tmp41 ? tmp42() : -std::numeric_limits<decltype(tmp42())>::infinity();
                            auto tmp45 = (tmp40 != tmp40) ? tmp40 : std::max(tmp44, tmp40);
                            auto tmp46 = tmp35 & tmp26;
                            auto tmp47 = [&]
                            {
                                auto tmp48 = out_ptr0[64 + i1 + (128*i3) + (14336*i2) + (802816*i0)];
                                return tmp48;
                            }
                            ;
                            auto tmp49 = tmp46 ? tmp47() : -std::numeric_limits<decltype(tmp47())>::infinity();
                            auto tmp50 = (tmp45 != tmp45) ? tmp45 : std::max(tmp49, tmp45);
                            auto tmp51 = static_cast<long>(1 + (2*i2));
                            auto tmp52 = tmp51 >= tmp1;
                            auto tmp53 = tmp51 < tmp3;
                            auto tmp54 = tmp52 & tmp53;
                            auto tmp55 = tmp54 & tmp9;
                            auto tmp56 = [&]
                            {
                                auto tmp57 = out_ptr0[7104 + i1 + (128*i3) + (14336*i2) + (802816*i0)];
                                return tmp57;
                            }
                            ;
                            auto tmp58 = tmp55 ? tmp56() : -std::numeric_limits<decltype(tmp56())>::infinity();
                            auto tmp59 = (tmp50 != tmp50) ? tmp50 : std::max(tmp58, tmp50);
                            auto tmp60 = tmp54 & tmp17;
                            auto tmp61 = [&]
                            {
                                auto tmp62 = out_ptr0[7168 + i1 + (128*i3) + (14336*i2) + (802816*i0)];
                                return tmp62;
                            }
                            ;
                            auto tmp63 = tmp60 ? tmp61() : -std::numeric_limits<decltype(tmp61())>::infinity();
                            auto tmp64 = (tmp59 != tmp59) ? tmp59 : std::max(tmp63, tmp59);
                            auto tmp65 = tmp54 & tmp26;
                            auto tmp66 = [&]
                            {
                                auto tmp67 = out_ptr0[7232 + i1 + (128*i3) + (14336*i2) + (802816*i0)];
                                return tmp67;
                            }
                            ;
                            auto tmp68 = tmp65 ? tmp66() : -std::numeric_limits<decltype(tmp66())>::infinity();
                            auto tmp69 = (tmp64 != tmp64) ? tmp64 : std::max(tmp68, tmp64);
                            out_ptr1[i1 + (64*i3) + (3584*i2) + (200704*i0)] = tmp69;
                        }
                    }
                }
            }
        }
        {
            #pragma omp for
            for(long i0=0; i0<116; i0+=1)
            {
                #pragma GCC ivdep
                for(long i1=0; i1<64; i1+=1)
                {
                    #pragma GCC ivdep
                    for(long i2=0; i2<3136; i2+=1)
                    {
                        auto tmp2 = out_ptr1[i1 + (64*i2) + (200704*i0)];
                        auto tmp3 = in_ptr3[0];
                        auto tmp6 = in_ptr4[0];
                        auto tmp0 = static_cast<float>(0);
                        auto tmp1 = static_cast<float>(127);
                        auto tmp4 = tmp2 / tmp3;
                        auto tmp5 = std::nearbyint(tmp4);
                        auto tmp7 = static_cast<float>(tmp6);
                        auto tmp8 = tmp5 + tmp7;
                        auto tmp9 = (tmp8 != tmp8) ? tmp8 : std::min(tmp1, tmp8);
                        auto tmp10 = (tmp9 != tmp9) ? tmp9 : std::max(tmp0, tmp9);
                        auto tmp11 = static_cast<unsigned char>(tmp10);
                        out_ptr2[i2 + (3136*i1) + (200704*i0)] = tmp11;
                    }
                }
            }
        }
    }
}
''')


kernel_cpp_2 = async_compile.cpp('''
#include <ATen/record_function.h>
#include "/tmp/torchinductor_leslie/zt/cztcl2vp5yqlnhofzpqfficjcxgyict6e3xhfdd7sdbkipp4p44x.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
                       const unsigned char* __restrict__ in_ptr0,
                       const long* __restrict__ in_ptr1,
                       const float* __restrict__ in_ptr2,
                       const float* __restrict__ in_ptr3,
                       const long* __restrict__ in_ptr4,
                       unsigned char* __restrict__ out_ptr1)
{
    RECORD_FUNCTION("graph_0_kernel_cpp_2", c10::ArrayRef<c10::IValue>({}));
    auto out_ptr0 = in_out_ptr0;
    #pragma omp parallel num_threads(56)
    {
        {
            #pragma omp for
            for(long i0=0; i0<116; i0+=1)
            {
                #pragma GCC ivdep
                for(long i1=0; i1<2048; i1+=1)
                {
                    {
                        float tmp7 = 0;
                        for(long i2=0; i2<49; i2+=1)
                        {
                            auto tmp0 = in_ptr0[i1 + (2048*i2) + (100352*i0)];
                            auto tmp2 = in_ptr1[0];
                            auto tmp5 = in_ptr2[0];
                            auto tmp1 = static_cast<float>(tmp0);
                            auto tmp3 = static_cast<float>(tmp2);
                            auto tmp4 = tmp1 - tmp3;
                            auto tmp6 = tmp4 * tmp5;
                            tmp7 += tmp6;
                        }
                        out_ptr0[i1 + (2048*i0)] = tmp7;
                    }
                }
            }
        }
        {
            #pragma omp for
            for(long i0=0; i0<237568; i0+=1)
            {
                auto tmp0 = out_ptr0[i0];
                auto tmp5 = in_ptr3[0];
                auto tmp8 = in_ptr4[0];
                auto tmp1 = static_cast<float>(49);
                auto tmp2 = tmp0 / tmp1;
                auto tmp3 = static_cast<float>(0);
                auto tmp4 = static_cast<float>(127);
                auto tmp6 = tmp2 / tmp5;
                auto tmp7 = std::nearbyint(tmp6);
                auto tmp9 = static_cast<float>(tmp8);
                auto tmp10 = tmp7 + tmp9;
                auto tmp11 = (tmp10 != tmp10) ? tmp10 : std::min(tmp4, tmp10);
                auto tmp12 = (tmp11 != tmp11) ? tmp11 : std::max(tmp3, tmp11);
                auto tmp13 = static_cast<unsigned char>(tmp12);
                out_ptr1[i0] = tmp13;
            }
        }
    }
}
''')


kernel_cpp_3 = async_compile.cpp('''
#include <ATen/record_function.h>
#include "/tmp/torchinductor_leslie/zt/cztcl2vp5yqlnhofzpqfficjcxgyict6e3xhfdd7sdbkipp4p44x.h"
extern "C" void kernel(const unsigned char* __restrict__ in_ptr0,
                       const long* __restrict__ in_ptr1,
                       const float* __restrict__ in_ptr2,
                       float* __restrict__ out_ptr0)
{
    RECORD_FUNCTION("graph_0_kernel_cpp_3", c10::ArrayRef<c10::IValue>({}));
    {
        #pragma GCC ivdep
        for(long i0=0; i0<116000; i0+=1)
        {
            auto tmp0 = in_ptr0[i0];
            auto tmp2 = in_ptr1[0];
            auto tmp5 = in_ptr2[0];
            auto tmp1 = static_cast<float>(tmp0);
            auto tmp3 = static_cast<float>(tmp2);
            auto tmp4 = tmp1 - tmp3;
            auto tmp6 = tmp4 * tmp5;
            out_ptr0[i0] = tmp6;
        }
    }
}
''')


async_compile.wait(globals())
del async_compile

def call(args):
    arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, arg12_1, arg13_1, arg14_1, arg15_1, arg16_1, arg17_1, arg18_1, arg19_1, arg20_1, arg21_1, arg22_1, arg23_1, arg24_1, arg25_1, arg26_1, arg27_1, arg28_1, arg29_1, arg30_1, arg31_1, arg32_1, arg33_1, arg34_1, arg35_1, arg36_1, arg37_1, arg38_1, arg39_1, arg40_1, arg41_1, arg42_1, arg43_1, arg44_1, arg45_1, arg46_1, arg47_1, arg48_1, arg49_1, arg50_1, arg51_1, arg52_1, arg53_1, arg54_1, arg55_1, arg56_1, arg57_1, arg58_1, arg59_1, arg60_1, arg61_1, arg62_1, arg63_1, arg64_1, arg65_1, arg66_1, arg67_1, arg68_1, arg69_1, arg70_1, arg71_1, arg72_1, arg73_1, arg74_1, arg75_1, arg76_1, arg77_1, arg78_1, arg79_1, arg80_1, arg81_1, arg82_1, arg83_1, arg84_1, arg85_1, arg86_1, arg87_1, arg88_1, arg89_1, arg90_1, arg91_1, arg92_1, arg93_1, arg94_1, arg95_1, arg96_1, arg97_1, arg98_1, arg99_1, arg100_1, arg101_1, arg102_1, arg103_1, arg104_1, arg105_1, arg106_1, arg107_1, arg108_1, arg109_1, arg110_1, arg111_1, arg112_1, arg113_1, arg114_1, arg115_1, arg116_1, arg117_1, arg118_1, arg119_1, arg120_1, arg121_1, arg122_1, arg123_1, arg124_1, arg125_1, arg126_1, arg127_1, arg128_1, arg129_1, arg130_1, arg131_1, arg132_1, arg133_1, arg134_1, arg135_1, arg136_1, arg137_1, arg138_1, arg139_1, arg140_1, arg141_1, arg142_1, arg143_1, arg144_1, arg145_1, arg146_1, arg147_1, arg148_1, arg149_1, arg150_1, arg151_1, arg152_1, arg153_1, arg154_1, arg155_1, arg156_1, arg157_1, arg158_1, arg159_1, arg160_1, arg161_1, arg162_1, arg163_1, arg164_1, arg165_1, arg166_1, arg167_1, arg168_1, arg169_1, arg170_1, arg171_1, arg172_1, arg173_1, arg174_1, arg175_1, arg176_1, arg177_1, arg178_1, arg179_1, arg180_1, arg181_1, arg182_1, arg183_1, arg184_1, arg185_1, arg186_1, arg187_1, arg188_1, arg189_1, arg190_1, arg191_1, arg192_1, arg193_1, arg194_1, arg195_1, arg196_1, arg197_1, arg198_1, arg199_1, arg200_1, arg201_1, arg202_1, arg203_1, arg204_1, arg205_1, arg206_1, arg207_1, arg208_1, arg209_1, arg210_1, arg211_1, arg212_1, arg213_1, arg214_1, arg215_1, arg216_1, arg217_1, arg218_1, arg219_1, arg220_1, arg221_1, arg222_1, arg223_1, arg224_1, arg225_1, arg226_1, arg227_1, arg228_1, arg229_1, arg230_1, arg231_1, arg232_1, arg233_1, arg234_1, arg235_1, arg236_1, arg237_1, arg238_1, arg239_1, arg240_1, arg241_1, arg242_1, arg243_1, arg244_1, arg245_1, arg246_1, arg247_1, arg248_1, arg249_1, arg250_1, arg251_1, arg252_1, arg253_1, arg254_1, arg255_1, arg256_1, arg257_1, arg258_1, arg259_1, arg260_1, arg261_1, arg262_1, arg263_1, arg264_1, arg265_1, arg266_1, arg267_1, arg268_1, arg269_1, arg270_1, arg271_1, arg272_1, arg273_1, arg274_1, arg275_1, arg276_1, arg277_1, arg278_1, arg279_1, arg280_1, arg281_1, arg282_1, arg283_1, arg284_1, arg285_1, arg286_1, arg287_1, arg288_1, arg289_1, arg290_1, arg291_1, arg292_1, arg293_1, arg294_1, arg295_1, arg296_1, arg297_1, arg298_1, arg299_1, arg300_1, arg301_1, arg302_1, arg303_1, arg304_1, arg305_1, arg306_1, arg307_1, arg308_1, arg309_1, arg310_1, arg311_1, arg312_1, arg313_1, arg314_1, arg315_1, arg316_1, arg317_1, arg318_1, arg319_1, arg320_1, arg321_1, arg322_1, arg323_1, arg324_1, arg325_1, arg326_1, arg327_1, arg328_1, arg329_1, arg330_1 = args
    args.clear()
    buf0 = empty_strided((116, 3, 224, 224), (150528, 1, 672, 3), device='cpu', dtype=torch.uint8)
    kernel_cpp_0(c_void_p(arg330_1.data_ptr()), c_void_p(arg0_1.data_ptr()), c_void_p(arg1_1.data_ptr()), c_void_p(buf0.data_ptr()))
    del arg330_1
    buf1 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf0, arg0_1, arg1_1, arg168_1, arg276_1, arg2_1, 0, arg169_1, [2, 2], [3, 3], [1, 1], 1, arg3_1, arg4_1, 'relu_')
    del arg0_1
    del arg168_1
    del arg169_1
    del arg1_1
    del arg276_1
    del arg2_1
    del buf0
    buf2 = buf1
    assert_size_stride(buf2, (116, 64, 112, 112), (802816, 1, 7168, 64))
    del buf1
    buf3 = empty_strided((116, 64, 112, 112), (802816, 1, 7168, 64), device='cpu', dtype=torch.float32)
    buf4 = empty_strided((116, 64, 56, 56), (200704, 1, 3584, 64), device='cpu', dtype=torch.float32)
    buf5 = empty_strided((116, 64, 56, 56), (200704, 3136, 56, 1), device='cpu', dtype=torch.uint8)
    kernel_cpp_1(c_void_p(buf2.data_ptr()), c_void_p(arg4_1.data_ptr()), c_void_p(arg3_1.data_ptr()), c_void_p(arg5_1.data_ptr()), c_void_p(arg6_1.data_ptr()), c_void_p(buf3.data_ptr()), c_void_p(buf4.data_ptr()), c_void_p(buf5.data_ptr()))
    del arg3_1
    del arg4_1
    del buf2
    del buf3
    del buf4
    buf6 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf5, arg5_1, arg6_1, arg170_1, arg277_1, arg7_1, 0, arg171_1, [1, 1], [0, 0], [1, 1], 1, arg8_1, arg9_1, 'relu_')
    del arg170_1
    del arg171_1
    del arg277_1
    del arg7_1
    buf7 = buf6
    assert_size_stride(buf7, (116, 64, 56, 56), (200704, 1, 3584, 64))
    del buf6
    buf8 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf7, arg8_1, arg9_1, arg172_1, arg278_1, arg10_1, 0, arg173_1, [1, 1], [1, 1], [1, 1], 1, arg11_1, arg12_1, 'relu_')
    del arg10_1
    del arg172_1
    del arg173_1
    del arg278_1
    del arg8_1
    del arg9_1
    del buf7
    buf9 = buf8
    assert_size_stride(buf9, (116, 64, 56, 56), (200704, 1, 3584, 64))
    del buf8
    buf10 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf5, arg5_1, arg6_1, arg174_1, arg279_1, arg14_1, 0, arg175_1, [1, 1], [0, 0], [1, 1], 1, arg15_1, arg16_1, 'none')
    del arg14_1
    del arg174_1
    del arg175_1
    del arg279_1
    del arg5_1
    del arg6_1
    del buf5
    buf11 = buf10
    assert_size_stride(buf11, (116, 256, 56, 56), (802816, 1, 14336, 256))
    del buf10
    buf12 = torch.ops.quantized_decomposed.conv_binary_inductor.tensor(buf9, arg11_1, arg12_1, buf11, arg15_1, arg16_1, arg176_1, arg280_1, arg13_1, 0, arg177_1, [1, 1], [0, 0], [1, 1], 1, arg17_1, arg18_1, 'add__relu_')
    del arg11_1
    del arg12_1
    del arg13_1
    del arg15_1
    del arg16_1
    del arg176_1
    del arg177_1
    del arg280_1
    del buf11
    del buf9
    buf13 = buf12
    assert_size_stride(buf13, (116, 256, 56, 56), (802816, 1, 14336, 256))
    del buf12
    buf14 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf13, arg17_1, arg18_1, arg178_1, arg281_1, arg19_1, 0, arg179_1, [1, 1], [0, 0], [1, 1], 1, arg20_1, arg21_1, 'relu_')
    del arg178_1
    del arg179_1
    del arg19_1
    del arg281_1
    buf15 = buf14
    assert_size_stride(buf15, (116, 64, 56, 56), (200704, 1, 3584, 64))
    del buf14
    buf16 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf15, arg20_1, arg21_1, arg180_1, arg282_1, arg22_1, 0, arg181_1, [1, 1], [1, 1], [1, 1], 1, arg23_1, arg24_1, 'relu_')
    del arg180_1
    del arg181_1
    del arg20_1
    del arg21_1
    del arg22_1
    del arg282_1
    del buf15
    buf17 = buf16
    assert_size_stride(buf17, (116, 64, 56, 56), (200704, 1, 3584, 64))
    del buf16
    buf18 = torch.ops.quantized_decomposed.conv_binary_inductor.tensor(buf17, arg23_1, arg24_1, buf13, arg17_1, arg18_1, arg182_1, arg283_1, arg25_1, 0, arg183_1, [1, 1], [0, 0], [1, 1], 1, arg26_1, arg27_1, 'add__relu_')
    del arg17_1
    del arg182_1
    del arg183_1
    del arg18_1
    del arg23_1
    del arg24_1
    del arg25_1
    del arg283_1
    del buf13
    del buf17
    buf19 = buf18
    assert_size_stride(buf19, (116, 256, 56, 56), (802816, 1, 14336, 256))
    del buf18
    buf20 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf19, arg26_1, arg27_1, arg184_1, arg284_1, arg28_1, 0, arg185_1, [1, 1], [0, 0], [1, 1], 1, arg29_1, arg30_1, 'relu_')
    del arg184_1
    del arg185_1
    del arg284_1
    del arg28_1
    buf21 = buf20
    assert_size_stride(buf21, (116, 64, 56, 56), (200704, 1, 3584, 64))
    del buf20
    buf22 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf21, arg29_1, arg30_1, arg186_1, arg285_1, arg31_1, 0, arg187_1, [1, 1], [1, 1], [1, 1], 1, arg32_1, arg33_1, 'relu_')
    del arg186_1
    del arg187_1
    del arg285_1
    del arg29_1
    del arg30_1
    del arg31_1
    del buf21
    buf23 = buf22
    assert_size_stride(buf23, (116, 64, 56, 56), (200704, 1, 3584, 64))
    del buf22
    buf24 = torch.ops.quantized_decomposed.conv_binary_inductor.tensor(buf23, arg32_1, arg33_1, buf19, arg26_1, arg27_1, arg188_1, arg286_1, arg34_1, 0, arg189_1, [1, 1], [0, 0], [1, 1], 1, arg35_1, arg36_1, 'add__relu_')
    del arg188_1
    del arg189_1
    del arg26_1
    del arg27_1
    del arg286_1
    del arg32_1
    del arg33_1
    del arg34_1
    del buf19
    del buf23
    buf25 = buf24
    assert_size_stride(buf25, (116, 256, 56, 56), (802816, 1, 14336, 256))
    del buf24
    buf26 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf25, arg35_1, arg36_1, arg190_1, arg287_1, arg37_1, 0, arg191_1, [1, 1], [0, 0], [1, 1], 1, arg38_1, arg39_1, 'relu_')
    del arg190_1
    del arg191_1
    del arg287_1
    del arg37_1
    buf27 = buf26
    assert_size_stride(buf27, (116, 128, 56, 56), (401408, 1, 7168, 128))
    del buf26
    buf28 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf27, arg38_1, arg39_1, arg192_1, arg288_1, arg40_1, 0, arg193_1, [2, 2], [1, 1], [1, 1], 1, arg41_1, arg42_1, 'relu_')
    del arg192_1
    del arg193_1
    del arg288_1
    del arg38_1
    del arg39_1
    del arg40_1
    del buf27
    buf29 = buf28
    assert_size_stride(buf29, (116, 128, 28, 28), (100352, 1, 3584, 128))
    del buf28
    buf30 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf25, arg35_1, arg36_1, arg194_1, arg289_1, arg44_1, 0, arg195_1, [2, 2], [0, 0], [1, 1], 1, arg45_1, arg46_1, 'none')
    del arg194_1
    del arg195_1
    del arg289_1
    del arg35_1
    del arg36_1
    del arg44_1
    del buf25
    buf31 = buf30
    assert_size_stride(buf31, (116, 512, 28, 28), (401408, 1, 14336, 512))
    del buf30
    buf32 = torch.ops.quantized_decomposed.conv_binary_inductor.tensor(buf29, arg41_1, arg42_1, buf31, arg45_1, arg46_1, arg196_1, arg290_1, arg43_1, 0, arg197_1, [1, 1], [0, 0], [1, 1], 1, arg47_1, arg48_1, 'add__relu_')
    del arg196_1
    del arg197_1
    del arg290_1
    del arg41_1
    del arg42_1
    del arg43_1
    del arg45_1
    del arg46_1
    del buf29
    del buf31
    buf33 = buf32
    assert_size_stride(buf33, (116, 512, 28, 28), (401408, 1, 14336, 512))
    del buf32
    buf34 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf33, arg47_1, arg48_1, arg198_1, arg291_1, arg49_1, 0, arg199_1, [1, 1], [0, 0], [1, 1], 1, arg50_1, arg51_1, 'relu_')
    del arg198_1
    del arg199_1
    del arg291_1
    del arg49_1
    buf35 = buf34
    assert_size_stride(buf35, (116, 128, 28, 28), (100352, 1, 3584, 128))
    del buf34
    buf36 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf35, arg50_1, arg51_1, arg200_1, arg292_1, arg52_1, 0, arg201_1, [1, 1], [1, 1], [1, 1], 1, arg53_1, arg54_1, 'relu_')
    del arg200_1
    del arg201_1
    del arg292_1
    del arg50_1
    del arg51_1
    del arg52_1
    del buf35
    buf37 = buf36
    assert_size_stride(buf37, (116, 128, 28, 28), (100352, 1, 3584, 128))
    del buf36
    buf38 = torch.ops.quantized_decomposed.conv_binary_inductor.tensor(buf37, arg53_1, arg54_1, buf33, arg47_1, arg48_1, arg202_1, arg293_1, arg55_1, 0, arg203_1, [1, 1], [0, 0], [1, 1], 1, arg56_1, arg57_1, 'add__relu_')
    del arg202_1
    del arg203_1
    del arg293_1
    del arg47_1
    del arg48_1
    del arg53_1
    del arg54_1
    del arg55_1
    del buf33
    del buf37
    buf39 = buf38
    assert_size_stride(buf39, (116, 512, 28, 28), (401408, 1, 14336, 512))
    del buf38
    buf40 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf39, arg56_1, arg57_1, arg204_1, arg294_1, arg58_1, 0, arg205_1, [1, 1], [0, 0], [1, 1], 1, arg59_1, arg60_1, 'relu_')
    del arg204_1
    del arg205_1
    del arg294_1
    del arg58_1
    buf41 = buf40
    assert_size_stride(buf41, (116, 128, 28, 28), (100352, 1, 3584, 128))
    del buf40
    buf42 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf41, arg59_1, arg60_1, arg206_1, arg295_1, arg61_1, 0, arg207_1, [1, 1], [1, 1], [1, 1], 1, arg62_1, arg63_1, 'relu_')
    del arg206_1
    del arg207_1
    del arg295_1
    del arg59_1
    del arg60_1
    del arg61_1
    del buf41
    buf43 = buf42
    assert_size_stride(buf43, (116, 128, 28, 28), (100352, 1, 3584, 128))
    del buf42
    buf44 = torch.ops.quantized_decomposed.conv_binary_inductor.tensor(buf43, arg62_1, arg63_1, buf39, arg56_1, arg57_1, arg208_1, arg296_1, arg64_1, 0, arg209_1, [1, 1], [0, 0], [1, 1], 1, arg65_1, arg66_1, 'add__relu_')
    del arg208_1
    del arg209_1
    del arg296_1
    del arg56_1
    del arg57_1
    del arg62_1
    del arg63_1
    del arg64_1
    del buf39
    del buf43
    buf45 = buf44
    assert_size_stride(buf45, (116, 512, 28, 28), (401408, 1, 14336, 512))
    del buf44
    buf46 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf45, arg65_1, arg66_1, arg210_1, arg297_1, arg67_1, 0, arg211_1, [1, 1], [0, 0], [1, 1], 1, arg68_1, arg69_1, 'relu_')
    del arg210_1
    del arg211_1
    del arg297_1
    del arg67_1
    buf47 = buf46
    assert_size_stride(buf47, (116, 128, 28, 28), (100352, 1, 3584, 128))
    del buf46
    buf48 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf47, arg68_1, arg69_1, arg212_1, arg298_1, arg70_1, 0, arg213_1, [1, 1], [1, 1], [1, 1], 1, arg71_1, arg72_1, 'relu_')
    del arg212_1
    del arg213_1
    del arg298_1
    del arg68_1
    del arg69_1
    del arg70_1
    del buf47
    buf49 = buf48
    assert_size_stride(buf49, (116, 128, 28, 28), (100352, 1, 3584, 128))
    del buf48
    buf50 = torch.ops.quantized_decomposed.conv_binary_inductor.tensor(buf49, arg71_1, arg72_1, buf45, arg65_1, arg66_1, arg214_1, arg299_1, arg73_1, 0, arg215_1, [1, 1], [0, 0], [1, 1], 1, arg74_1, arg75_1, 'add__relu_')
    del arg214_1
    del arg215_1
    del arg299_1
    del arg65_1
    del arg66_1
    del arg71_1
    del arg72_1
    del arg73_1
    del buf45
    del buf49
    buf51 = buf50
    assert_size_stride(buf51, (116, 512, 28, 28), (401408, 1, 14336, 512))
    del buf50
    buf52 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf51, arg74_1, arg75_1, arg216_1, arg300_1, arg76_1, 0, arg217_1, [1, 1], [0, 0], [1, 1], 1, arg77_1, arg78_1, 'relu_')
    del arg216_1
    del arg217_1
    del arg300_1
    del arg76_1
    buf53 = buf52
    assert_size_stride(buf53, (116, 256, 28, 28), (200704, 1, 7168, 256))
    del buf52
    buf54 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf53, arg77_1, arg78_1, arg218_1, arg301_1, arg79_1, 0, arg219_1, [2, 2], [1, 1], [1, 1], 1, arg80_1, arg81_1, 'relu_')
    del arg218_1
    del arg219_1
    del arg301_1
    del arg77_1
    del arg78_1
    del arg79_1
    del buf53
    buf55 = buf54
    assert_size_stride(buf55, (116, 256, 14, 14), (50176, 1, 3584, 256))
    del buf54
    buf56 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf51, arg74_1, arg75_1, arg220_1, arg302_1, arg83_1, 0, arg221_1, [2, 2], [0, 0], [1, 1], 1, arg84_1, arg85_1, 'none')
    del arg220_1
    del arg221_1
    del arg302_1
    del arg74_1
    del arg75_1
    del arg83_1
    del buf51
    buf57 = buf56
    assert_size_stride(buf57, (116, 1024, 14, 14), (200704, 1, 14336, 1024))
    del buf56
    buf58 = torch.ops.quantized_decomposed.conv_binary_inductor.tensor(buf55, arg80_1, arg81_1, buf57, arg84_1, arg85_1, arg222_1, arg303_1, arg82_1, 0, arg223_1, [1, 1], [0, 0], [1, 1], 1, arg86_1, arg87_1, 'add__relu_')
    del arg222_1
    del arg223_1
    del arg303_1
    del arg80_1
    del arg81_1
    del arg82_1
    del arg84_1
    del arg85_1
    del buf55
    del buf57
    buf59 = buf58
    assert_size_stride(buf59, (116, 1024, 14, 14), (200704, 1, 14336, 1024))
    del buf58
    buf60 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf59, arg86_1, arg87_1, arg224_1, arg304_1, arg88_1, 0, arg225_1, [1, 1], [0, 0], [1, 1], 1, arg89_1, arg90_1, 'relu_')
    del arg224_1
    del arg225_1
    del arg304_1
    del arg88_1
    buf61 = buf60
    assert_size_stride(buf61, (116, 256, 14, 14), (50176, 1, 3584, 256))
    del buf60
    buf62 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf61, arg89_1, arg90_1, arg226_1, arg305_1, arg91_1, 0, arg227_1, [1, 1], [1, 1], [1, 1], 1, arg92_1, arg93_1, 'relu_')
    del arg226_1
    del arg227_1
    del arg305_1
    del arg89_1
    del arg90_1
    del arg91_1
    del buf61
    buf63 = buf62
    assert_size_stride(buf63, (116, 256, 14, 14), (50176, 1, 3584, 256))
    del buf62
    buf64 = torch.ops.quantized_decomposed.conv_binary_inductor.tensor(buf63, arg92_1, arg93_1, buf59, arg86_1, arg87_1, arg228_1, arg306_1, arg94_1, 0, arg229_1, [1, 1], [0, 0], [1, 1], 1, arg95_1, arg96_1, 'add__relu_')
    del arg228_1
    del arg229_1
    del arg306_1
    del arg86_1
    del arg87_1
    del arg92_1
    del arg93_1
    del arg94_1
    del buf59
    del buf63
    buf65 = buf64
    assert_size_stride(buf65, (116, 1024, 14, 14), (200704, 1, 14336, 1024))
    del buf64
    buf66 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf65, arg95_1, arg96_1, arg230_1, arg307_1, arg97_1, 0, arg231_1, [1, 1], [0, 0], [1, 1], 1, arg98_1, arg99_1, 'relu_')
    del arg230_1
    del arg231_1
    del arg307_1
    del arg97_1
    buf67 = buf66
    assert_size_stride(buf67, (116, 256, 14, 14), (50176, 1, 3584, 256))
    del buf66
    buf68 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf67, arg98_1, arg99_1, arg232_1, arg308_1, arg100_1, 0, arg233_1, [1, 1], [1, 1], [1, 1], 1, arg101_1, arg102_1, 'relu_')
    del arg100_1
    del arg232_1
    del arg233_1
    del arg308_1
    del arg98_1
    del arg99_1
    del buf67
    buf69 = buf68
    assert_size_stride(buf69, (116, 256, 14, 14), (50176, 1, 3584, 256))
    del buf68
    buf70 = torch.ops.quantized_decomposed.conv_binary_inductor.tensor(buf69, arg101_1, arg102_1, buf65, arg95_1, arg96_1, arg234_1, arg309_1, arg103_1, 0, arg235_1, [1, 1], [0, 0], [1, 1], 1, arg104_1, arg105_1, 'add__relu_')
    del arg101_1
    del arg102_1
    del arg103_1
    del arg234_1
    del arg235_1
    del arg309_1
    del arg95_1
    del arg96_1
    del buf65
    del buf69
    buf71 = buf70
    assert_size_stride(buf71, (116, 1024, 14, 14), (200704, 1, 14336, 1024))
    del buf70
    buf72 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf71, arg104_1, arg105_1, arg236_1, arg310_1, arg106_1, 0, arg237_1, [1, 1], [0, 0], [1, 1], 1, arg107_1, arg108_1, 'relu_')
    del arg106_1
    del arg236_1
    del arg237_1
    del arg310_1
    buf73 = buf72
    assert_size_stride(buf73, (116, 256, 14, 14), (50176, 1, 3584, 256))
    del buf72
    buf74 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf73, arg107_1, arg108_1, arg238_1, arg311_1, arg109_1, 0, arg239_1, [1, 1], [1, 1], [1, 1], 1, arg110_1, arg111_1, 'relu_')
    del arg107_1
    del arg108_1
    del arg109_1
    del arg238_1
    del arg239_1
    del arg311_1
    del buf73
    buf75 = buf74
    assert_size_stride(buf75, (116, 256, 14, 14), (50176, 1, 3584, 256))
    del buf74
    buf76 = torch.ops.quantized_decomposed.conv_binary_inductor.tensor(buf75, arg110_1, arg111_1, buf71, arg104_1, arg105_1, arg240_1, arg312_1, arg112_1, 0, arg241_1, [1, 1], [0, 0], [1, 1], 1, arg113_1, arg114_1, 'add__relu_')
    del arg104_1
    del arg105_1
    del arg110_1
    del arg111_1
    del arg112_1
    del arg240_1
    del arg241_1
    del arg312_1
    del buf71
    del buf75
    buf77 = buf76
    assert_size_stride(buf77, (116, 1024, 14, 14), (200704, 1, 14336, 1024))
    del buf76
    buf78 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf77, arg113_1, arg114_1, arg242_1, arg313_1, arg115_1, 0, arg243_1, [1, 1], [0, 0], [1, 1], 1, arg116_1, arg117_1, 'relu_')
    del arg115_1
    del arg242_1
    del arg243_1
    del arg313_1
    buf79 = buf78
    assert_size_stride(buf79, (116, 256, 14, 14), (50176, 1, 3584, 256))
    del buf78
    buf80 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf79, arg116_1, arg117_1, arg244_1, arg314_1, arg118_1, 0, arg245_1, [1, 1], [1, 1], [1, 1], 1, arg119_1, arg120_1, 'relu_')
    del arg116_1
    del arg117_1
    del arg118_1
    del arg244_1
    del arg245_1
    del arg314_1
    del buf79
    buf81 = buf80
    assert_size_stride(buf81, (116, 256, 14, 14), (50176, 1, 3584, 256))
    del buf80
    buf82 = torch.ops.quantized_decomposed.conv_binary_inductor.tensor(buf81, arg119_1, arg120_1, buf77, arg113_1, arg114_1, arg246_1, arg315_1, arg121_1, 0, arg247_1, [1, 1], [0, 0], [1, 1], 1, arg122_1, arg123_1, 'add__relu_')
    del arg113_1
    del arg114_1
    del arg119_1
    del arg120_1
    del arg121_1
    del arg246_1
    del arg247_1
    del arg315_1
    del buf77
    del buf81
    buf83 = buf82
    assert_size_stride(buf83, (116, 1024, 14, 14), (200704, 1, 14336, 1024))
    del buf82
    buf84 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf83, arg122_1, arg123_1, arg248_1, arg316_1, arg124_1, 0, arg249_1, [1, 1], [0, 0], [1, 1], 1, arg125_1, arg126_1, 'relu_')
    del arg124_1
    del arg248_1
    del arg249_1
    del arg316_1
    buf85 = buf84
    assert_size_stride(buf85, (116, 256, 14, 14), (50176, 1, 3584, 256))
    del buf84
    buf86 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf85, arg125_1, arg126_1, arg250_1, arg317_1, arg127_1, 0, arg251_1, [1, 1], [1, 1], [1, 1], 1, arg128_1, arg129_1, 'relu_')
    del arg125_1
    del arg126_1
    del arg127_1
    del arg250_1
    del arg251_1
    del arg317_1
    del buf85
    buf87 = buf86
    assert_size_stride(buf87, (116, 256, 14, 14), (50176, 1, 3584, 256))
    del buf86
    buf88 = torch.ops.quantized_decomposed.conv_binary_inductor.tensor(buf87, arg128_1, arg129_1, buf83, arg122_1, arg123_1, arg252_1, arg318_1, arg130_1, 0, arg253_1, [1, 1], [0, 0], [1, 1], 1, arg131_1, arg132_1, 'add__relu_')
    del arg122_1
    del arg123_1
    del arg128_1
    del arg129_1
    del arg130_1
    del arg252_1
    del arg253_1
    del arg318_1
    del buf83
    del buf87
    buf89 = buf88
    assert_size_stride(buf89, (116, 1024, 14, 14), (200704, 1, 14336, 1024))
    del buf88
    buf90 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf89, arg131_1, arg132_1, arg254_1, arg319_1, arg133_1, 0, arg255_1, [1, 1], [0, 0], [1, 1], 1, arg134_1, arg135_1, 'relu_')
    del arg133_1
    del arg254_1
    del arg255_1
    del arg319_1
    buf91 = buf90
    assert_size_stride(buf91, (116, 512, 14, 14), (100352, 1, 7168, 512))
    del buf90
    buf92 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf91, arg134_1, arg135_1, arg256_1, arg320_1, arg136_1, 0, arg257_1, [2, 2], [1, 1], [1, 1], 1, arg137_1, arg138_1, 'relu_')
    del arg134_1
    del arg135_1
    del arg136_1
    del arg256_1
    del arg257_1
    del arg320_1
    del buf91
    buf93 = buf92
    assert_size_stride(buf93, (116, 512, 7, 7), (25088, 1, 3584, 512))
    del buf92
    buf94 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf89, arg131_1, arg132_1, arg258_1, arg321_1, arg140_1, 0, arg259_1, [2, 2], [0, 0], [1, 1], 1, arg141_1, arg142_1, 'none')
    del arg131_1
    del arg132_1
    del arg140_1
    del arg258_1
    del arg259_1
    del arg321_1
    del buf89
    buf95 = buf94
    assert_size_stride(buf95, (116, 2048, 7, 7), (100352, 1, 14336, 2048))
    del buf94
    buf96 = torch.ops.quantized_decomposed.conv_binary_inductor.tensor(buf93, arg137_1, arg138_1, buf95, arg141_1, arg142_1, arg260_1, arg322_1, arg139_1, 0, arg261_1, [1, 1], [0, 0], [1, 1], 1, arg143_1, arg144_1, 'add__relu_')
    del arg137_1
    del arg138_1
    del arg139_1
    del arg141_1
    del arg142_1
    del arg260_1
    del arg261_1
    del arg322_1
    del buf93
    del buf95
    buf97 = buf96
    assert_size_stride(buf97, (116, 2048, 7, 7), (100352, 1, 14336, 2048))
    del buf96
    buf98 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf97, arg143_1, arg144_1, arg262_1, arg323_1, arg145_1, 0, arg263_1, [1, 1], [0, 0], [1, 1], 1, arg146_1, arg147_1, 'relu_')
    del arg145_1
    del arg262_1
    del arg263_1
    del arg323_1
    buf99 = buf98
    assert_size_stride(buf99, (116, 512, 7, 7), (25088, 1, 3584, 512))
    del buf98
    buf100 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf99, arg146_1, arg147_1, arg264_1, arg324_1, arg148_1, 0, arg265_1, [1, 1], [1, 1], [1, 1], 1, arg149_1, arg150_1, 'relu_')
    del arg146_1
    del arg147_1
    del arg148_1
    del arg264_1
    del arg265_1
    del arg324_1
    del buf99
    buf101 = buf100
    assert_size_stride(buf101, (116, 512, 7, 7), (25088, 1, 3584, 512))
    del buf100
    buf102 = torch.ops.quantized_decomposed.conv_binary_inductor.tensor(buf101, arg149_1, arg150_1, buf97, arg143_1, arg144_1, arg266_1, arg325_1, arg151_1, 0, arg267_1, [1, 1], [0, 0], [1, 1], 1, arg152_1, arg153_1, 'add__relu_')
    del arg143_1
    del arg144_1
    del arg149_1
    del arg150_1
    del arg151_1
    del arg266_1
    del arg267_1
    del arg325_1
    del buf101
    del buf97
    buf103 = buf102
    assert_size_stride(buf103, (116, 2048, 7, 7), (100352, 1, 14336, 2048))
    del buf102
    buf104 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf103, arg152_1, arg153_1, arg268_1, arg326_1, arg154_1, 0, arg269_1, [1, 1], [0, 0], [1, 1], 1, arg155_1, arg156_1, 'relu_')
    del arg154_1
    del arg268_1
    del arg269_1
    del arg326_1
    buf105 = buf104
    assert_size_stride(buf105, (116, 512, 7, 7), (25088, 1, 3584, 512))
    del buf104
    buf106 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf105, arg155_1, arg156_1, arg270_1, arg327_1, arg157_1, 0, arg271_1, [1, 1], [1, 1], [1, 1], 1, arg158_1, arg159_1, 'relu_')
    del arg155_1
    del arg156_1
    del arg157_1
    del arg270_1
    del arg271_1
    del arg327_1
    del buf105
    buf107 = buf106
    assert_size_stride(buf107, (116, 512, 7, 7), (25088, 1, 3584, 512))
    del buf106
    buf108 = torch.ops.quantized_decomposed.conv_binary_inductor.tensor(buf107, arg158_1, arg159_1, buf103, arg152_1, arg153_1, arg272_1, arg328_1, arg160_1, 0, arg273_1, [1, 1], [0, 0], [1, 1], 1, arg161_1, arg162_1, 'add__relu_')
    del arg152_1
    del arg153_1
    del arg158_1
    del arg159_1
    del arg160_1
    del arg272_1
    del arg273_1
    del arg328_1
    del buf103
    del buf107
    buf109 = buf108
    assert_size_stride(buf109, (116, 2048, 7, 7), (100352, 1, 14336, 2048))
    del buf108
    buf110 = empty_strided((116, 2048, 1, 1), (2048, 1, 237568, 237568), device='cpu', dtype=torch.float32)
    buf111 = as_strided(buf110, (116, 2048, 1, 1), (2048, 1, 1, 1)); del buf110  # reuse
    buf112 = empty_strided((116, 2048), (2048, 1), device='cpu', dtype=torch.uint8)
    kernel_cpp_2(c_void_p(buf111.data_ptr()), c_void_p(buf109.data_ptr()), c_void_p(arg162_1.data_ptr()), c_void_p(arg161_1.data_ptr()), c_void_p(arg163_1.data_ptr()), c_void_p(arg164_1.data_ptr()), c_void_p(buf112.data_ptr()))
    del arg161_1
    del arg162_1
    del buf109
    del buf111
    buf113 = torch.ops.quantized_decomposed.linear_unary_inductor.tensor(buf112, arg163_1, arg164_1, arg274_1, arg329_1, arg165_1, 0, arg275_1, arg166_1, arg167_1, 'none')
    del arg163_1
    del arg164_1
    del arg165_1
    del arg274_1
    del arg275_1
    del arg329_1
    del buf112
    buf114 = buf113
    assert_size_stride(buf114, (116, 1000), (1000, 1))
    del buf113
    buf115 = empty_strided((116, 1000), (1000, 1), device='cpu', dtype=torch.float32)
    kernel_cpp_3(c_void_p(buf114.data_ptr()), c_void_p(arg167_1.data_ptr()), c_void_p(arg166_1.data_ptr()), c_void_p(buf115.data_ptr()))
    del arg166_1
    del arg167_1
    return (buf115, )


if __name__ == "__main__":
    from torch._dynamo.testing import rand_strided
    from torch._inductor.utils import print_performance
    arg0_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg1_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg2_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.int64)
    arg3_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg4_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg5_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg6_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg7_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.int64)
    arg8_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg9_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg10_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.int64)
    arg11_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg12_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg13_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64)
    arg14_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64)
    arg15_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg16_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg17_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg18_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg19_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.int64)
    arg20_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg21_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg22_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.int64)
    arg23_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg24_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg25_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64)
    arg26_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg27_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg28_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.int64)
    arg29_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg30_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg31_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.int64)
    arg32_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg33_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg34_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64)
    arg35_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg36_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg37_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.int64)
    arg38_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg39_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg40_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.int64)
    arg41_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg42_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg43_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.int64)
    arg44_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.int64)
    arg45_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg46_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg47_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg48_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg49_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.int64)
    arg50_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg51_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg52_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.int64)
    arg53_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg54_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg55_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.int64)
    arg56_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg57_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg58_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.int64)
    arg59_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg60_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg61_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.int64)
    arg62_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg63_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg64_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.int64)
    arg65_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg66_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg67_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.int64)
    arg68_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg69_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg70_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.int64)
    arg71_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg72_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg73_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.int64)
    arg74_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg75_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg76_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64)
    arg77_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg78_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg79_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64)
    arg80_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg81_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg82_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.int64)
    arg83_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.int64)
    arg84_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg85_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg86_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg87_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg88_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64)
    arg89_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg90_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg91_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64)
    arg92_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg93_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg94_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.int64)
    arg95_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg96_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg97_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64)
    arg98_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg99_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg100_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64)
    arg101_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg102_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg103_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.int64)
    arg104_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg105_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg106_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64)
    arg107_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg108_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg109_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64)
    arg110_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg111_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg112_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.int64)
    arg113_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg114_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg115_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64)
    arg116_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg117_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg118_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64)
    arg119_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg120_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg121_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.int64)
    arg122_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg123_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg124_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64)
    arg125_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg126_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg127_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64)
    arg128_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg129_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg130_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.int64)
    arg131_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg132_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg133_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.int64)
    arg134_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg135_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg136_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.int64)
    arg137_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg138_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg139_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.int64)
    arg140_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.int64)
    arg141_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg142_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg143_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg144_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg145_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.int64)
    arg146_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg147_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg148_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.int64)
    arg149_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg150_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg151_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.int64)
    arg152_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg153_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg154_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.int64)
    arg155_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg156_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg157_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.int64)
    arg158_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg159_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg160_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.int64)
    arg161_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg162_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg163_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg164_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg165_1 = rand_strided((1000, ), (1, ), device='cpu', dtype=torch.int64)
    arg166_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
    arg167_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
    arg168_1 = rand_strided((64, 3, 7, 7), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg169_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
    arg170_1 = rand_strided((64, 64, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg171_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
    arg172_1 = rand_strided((64, 64, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg173_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
    arg174_1 = rand_strided((256, 64, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg175_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
    arg176_1 = rand_strided((256, 64, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg177_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
    arg178_1 = rand_strided((64, 256, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg179_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
    arg180_1 = rand_strided((64, 64, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg181_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
    arg182_1 = rand_strided((256, 64, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg183_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
    arg184_1 = rand_strided((64, 256, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg185_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
    arg186_1 = rand_strided((64, 64, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg187_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
    arg188_1 = rand_strided((256, 64, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg189_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
    arg190_1 = rand_strided((128, 256, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg191_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
    arg192_1 = rand_strided((128, 128, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg193_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
    arg194_1 = rand_strided((512, 256, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg195_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
    arg196_1 = rand_strided((512, 128, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg197_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
    arg198_1 = rand_strided((128, 512, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg199_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
    arg200_1 = rand_strided((128, 128, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg201_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
    arg202_1 = rand_strided((512, 128, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg203_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
    arg204_1 = rand_strided((128, 512, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg205_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
    arg206_1 = rand_strided((128, 128, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg207_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
    arg208_1 = rand_strided((512, 128, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg209_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
    arg210_1 = rand_strided((128, 512, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg211_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
    arg212_1 = rand_strided((128, 128, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg213_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
    arg214_1 = rand_strided((512, 128, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg215_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
    arg216_1 = rand_strided((256, 512, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg217_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
    arg218_1 = rand_strided((256, 256, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg219_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
    arg220_1 = rand_strided((1024, 512, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg221_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
    arg222_1 = rand_strided((1024, 256, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg223_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
    arg224_1 = rand_strided((256, 1024, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg225_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
    arg226_1 = rand_strided((256, 256, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg227_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
    arg228_1 = rand_strided((1024, 256, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg229_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
    arg230_1 = rand_strided((256, 1024, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg231_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
    arg232_1 = rand_strided((256, 256, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg233_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
    arg234_1 = rand_strided((1024, 256, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg235_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
    arg236_1 = rand_strided((256, 1024, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg237_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
    arg238_1 = rand_strided((256, 256, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg239_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
    arg240_1 = rand_strided((1024, 256, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg241_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
    arg242_1 = rand_strided((256, 1024, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg243_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
    arg244_1 = rand_strided((256, 256, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg245_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
    arg246_1 = rand_strided((1024, 256, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg247_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
    arg248_1 = rand_strided((256, 1024, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg249_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
    arg250_1 = rand_strided((256, 256, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg251_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
    arg252_1 = rand_strided((1024, 256, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg253_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
    arg254_1 = rand_strided((512, 1024, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg255_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
    arg256_1 = rand_strided((512, 512, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg257_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
    arg258_1 = rand_strided((2048, 1024, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg259_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.float32)
    arg260_1 = rand_strided((2048, 512, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg261_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.float32)
    arg262_1 = rand_strided((512, 2048, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg263_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
    arg264_1 = rand_strided((512, 512, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg265_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
    arg266_1 = rand_strided((2048, 512, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg267_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.float32)
    arg268_1 = rand_strided((512, 2048, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg269_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
    arg270_1 = rand_strided((512, 512, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg271_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
    arg272_1 = rand_strided((2048, 512, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
    arg273_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.float32)
    arg274_1 = rand_strided((2048, 1000), (1, 0), device='cpu', dtype=torch.int8)
    arg275_1 = rand_strided((1, 1000), (1, 0), device='cpu', dtype=torch.float32)
    arg276_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
    arg277_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
    arg278_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
    arg279_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
    arg280_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
    arg281_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
    arg282_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
    arg283_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
    arg284_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
    arg285_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
    arg286_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
    arg287_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
    arg288_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
    arg289_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
    arg290_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
    arg291_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
    arg292_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
    arg293_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
    arg294_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
    arg295_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
    arg296_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
    arg297_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
    arg298_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
    arg299_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
    arg300_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
    arg301_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
    arg302_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
    arg303_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
    arg304_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
    arg305_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
    arg306_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
    arg307_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
    arg308_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
    arg309_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
    arg310_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
    arg311_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
    arg312_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
    arg313_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
    arg314_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
    arg315_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
    arg316_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
    arg317_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
    arg318_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
    arg319_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
    arg320_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
    arg321_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.float32)
    arg322_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.float32)
    arg323_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
    arg324_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
    arg325_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.float32)
    arg326_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
    arg327_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
    arg328_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.float32)
    arg329_1 = rand_strided((1000, ), (1, ), device='cpu', dtype=torch.float32)
    arg330_1 = rand_strided((116, 3, 224, 224), (150528, 1, 672, 3), device='cpu', dtype=torch.float32)
    print_performance(lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, arg12_1, arg13_1, arg14_1, arg15_1, arg16_1, arg17_1, arg18_1, arg19_1, arg20_1, arg21_1, arg22_1, arg23_1, arg24_1, arg25_1, arg26_1, arg27_1, arg28_1, arg29_1, arg30_1, arg31_1, arg32_1, arg33_1, arg34_1, arg35_1, arg36_1, arg37_1, arg38_1, arg39_1, arg40_1, arg41_1, arg42_1, arg43_1, arg44_1, arg45_1, arg46_1, arg47_1, arg48_1, arg49_1, arg50_1, arg51_1, arg52_1, arg53_1, arg54_1, arg55_1, arg56_1, arg57_1, arg58_1, arg59_1, arg60_1, arg61_1, arg62_1, arg63_1, arg64_1, arg65_1, arg66_1, arg67_1, arg68_1, arg69_1, arg70_1, arg71_1, arg72_1, arg73_1, arg74_1, arg75_1, arg76_1, arg77_1, arg78_1, arg79_1, arg80_1, arg81_1, arg82_1, arg83_1, arg84_1, arg85_1, arg86_1, arg87_1, arg88_1, arg89_1, arg90_1, arg91_1, arg92_1, arg93_1, arg94_1, arg95_1, arg96_1, arg97_1, arg98_1, arg99_1, arg100_1, arg101_1, arg102_1, arg103_1, arg104_1, arg105_1, arg106_1, arg107_1, arg108_1, arg109_1, arg110_1, arg111_1, arg112_1, arg113_1, arg114_1, arg115_1, arg116_1, arg117_1, arg118_1, arg119_1, arg120_1, arg121_1, arg122_1, arg123_1, arg124_1, arg125_1, arg126_1, arg127_1, arg128_1, arg129_1, arg130_1, arg131_1, arg132_1, arg133_1, arg134_1, arg135_1, arg136_1, arg137_1, arg138_1, arg139_1, arg140_1, arg141_1, arg142_1, arg143_1, arg144_1, arg145_1, arg146_1, arg147_1, arg148_1, arg149_1, arg150_1, arg151_1, arg152_1, arg153_1, arg154_1, arg155_1, arg156_1, arg157_1, arg158_1, arg159_1, arg160_1, arg161_1, arg162_1, arg163_1, arg164_1, arg165_1, arg166_1, arg167_1, arg168_1, arg169_1, arg170_1, arg171_1, arg172_1, arg173_1, arg174_1, arg175_1, arg176_1, arg177_1, arg178_1, arg179_1, arg180_1, arg181_1, arg182_1, arg183_1, arg184_1, arg185_1, arg186_1, arg187_1, arg188_1, arg189_1, arg190_1, arg191_1, arg192_1, arg193_1, arg194_1, arg195_1, arg196_1, arg197_1, arg198_1, arg199_1, arg200_1, arg201_1, arg202_1, arg203_1, arg204_1, arg205_1, arg206_1, arg207_1, arg208_1, arg209_1, arg210_1, arg211_1, arg212_1, arg213_1, arg214_1, arg215_1, arg216_1, arg217_1, arg218_1, arg219_1, arg220_1, arg221_1, arg222_1, arg223_1, arg224_1, arg225_1, arg226_1, arg227_1, arg228_1, arg229_1, arg230_1, arg231_1, arg232_1, arg233_1, arg234_1, arg235_1, arg236_1, arg237_1, arg238_1, arg239_1, arg240_1, arg241_1, arg242_1, arg243_1, arg244_1, arg245_1, arg246_1, arg247_1, arg248_1, arg249_1, arg250_1, arg251_1, arg252_1, arg253_1, arg254_1, arg255_1, arg256_1, arg257_1, arg258_1, arg259_1, arg260_1, arg261_1, arg262_1, arg263_1, arg264_1, arg265_1, arg266_1, arg267_1, arg268_1, arg269_1, arg270_1, arg271_1, arg272_1, arg273_1, arg274_1, arg275_1, arg276_1, arg277_1, arg278_1, arg279_1, arg280_1, arg281_1, arg282_1, arg283_1, arg284_1, arg285_1, arg286_1, arg287_1, arg288_1, arg289_1, arg290_1, arg291_1, arg292_1, arg293_1, arg294_1, arg295_1, arg296_1, arg297_1, arg298_1, arg299_1, arg300_1, arg301_1, arg302_1, arg303_1, arg304_1, arg305_1, arg306_1, arg307_1, arg308_1, arg309_1, arg310_1, arg311_1, arg312_1, arg313_1, arg314_1, arg315_1, arg316_1, arg317_1, arg318_1, arg319_1, arg320_1, arg321_1, arg322_1, arg323_1, arg324_1, arg325_1, arg326_1, arg327_1, arg328_1, arg329_1, arg330_1]))