Codegen Debug
[DUMP kernel.cpp:2913] TensorExprKernel graph:
[DUMP kernel.cpp:2913] graph(%0 : Float(1, 72, 112, 112, strides=[903168, 12544, 112, 1], requires_grad=0, device=cuda:0),
[DUMP kernel.cpp:2913]       %1 : Float(1, 72, 112, 112, strides=[903168, 12544, 112, 1], requires_grad=0, device=cuda:0)):
[DUMP kernel.cpp:2913]   %self.blocks.0.0.has_residual : bool = prim::Constant[value=1]()
[DUMP kernel.cpp:2913]   %3 : float = prim::Constant[value=1.0000000000000001e-05]()
[DUMP kernel.cpp:2913]   %4 : float = prim::Constant[value=0.10000000000000001]()
[DUMP kernel.cpp:2913]   %self.bn1.training : bool = prim::Constant[value=0]()
[DUMP kernel.cpp:2913]   %self.blocks.1.0.bn1.running_mean : Float(144, strides=[1], requires_grad=0, device=cuda:0) = prim::Constant[value=<Tensor>]()
[DUMP kernel.cpp:2913]   %self.blocks.1.0.bn1.running_var : Float(144, strides=[1], requires_grad=0, device=cuda:0) = prim::Constant[value=<Tensor>]()
[DUMP kernel.cpp:2913]   %8 : int = prim::Constant[value=1]()
[DUMP kernel.cpp:2913]   %x_out.199 : Tensor[] = prim::ListConstruct(%0, %1)
[DUMP kernel.cpp:2913]   %x.690 : Float(1, 144, 112, 112, strides=[1806336, 12544, 112, 1], requires_grad=0, device=cuda:0) = aten::cat(%x_out.199, %8) # /home/ec2-user/anaconda3/envs/robieta_debug_env/lib/python3.7/site-packages/timm/models/layers/mixed_conv2d.py:50:12
[DUMP kernel.cpp:2913]   %x.686 : Float(1, 144, 112, 112, strides=[1806336, 12544, 112, 1], requires_grad=0, device=cuda:0) = aten::batch_norm(%x.690, %self.blocks.1.0.bn1.running_var, %self.blocks.1.0.bn1.running_mean, %self.blocks.1.0.bn1.running_mean, %self.blocks.1.0.bn1.running_var, %self.bn1.training, %4, %3, %self.blocks.0.0.has_residual) # /home/ec2-user/anaconda3/envs/robieta_debug_env/lib/python3.7/site-packages/torch/nn/functional.py:2282:11
[DUMP kernel.cpp:2913]   return (%x.686, %x.690)
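The fused subgraph concatenates two 72-channel tensors on dim 1 and feeds the result through an inference-mode batch norm. The `debug.py` that produced this dump is not shown; a hypothetical standalone sketch that builds the same cat + batch_norm pattern (module name, shapes, and the freeze step are assumptions inferred from the dump, and it may not reproduce the failure exactly) might look like:

```python
# Hypothetical repro sketch, NOT the original debug.py (which is not shown).
# It mirrors the fused subgraph: cat two 72-channel halves on dim 1, then
# BatchNorm2d(144); freezing folds the BN buffers into dotted constants like
# %self.blocks.1.0.bn1.running_mean seen in the dump above.
import torch

class CatBN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.bn = torch.nn.BatchNorm2d(144)

    def forward(self, a, b):
        x = torch.cat([a, b], dim=1)      # (1, 144, 112, 112)
        return self.bn(x), x              # the graph returns both outputs

model = CatBN().cuda().eval()
jit_model = torch.jit.freeze(torch.jit.script(model))
a = torch.randn(1, 72, 112, 112, device="cuda")
b = torch.randn(1, 72, 112, 112, device="cuda")
with torch.no_grad():
    for _ in range(3):   # warm-up runs let the profiling executor invoke the TE fuser
        jit_model(a, b)
```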
[DEBUG kernel.cpp:2492] Original Stmt:
[DEBUG kernel.cpp:2492] {
[DEBUG kernel.cpp:2492]   for (int v = 0; v < 1; v++) {
[DEBUG kernel.cpp:2492]     for (int v_1 = 0; v_1 < 144; v_1++) {
[DEBUG kernel.cpp:2492]       for (int v_2 = 0; v_2 < 112; v_2++) {
[DEBUG kernel.cpp:2492]         for (int v_3 = 0; v_3 < 112; v_3++) {
[DEBUG kernel.cpp:2492]           aten_cat[v, v_1, v_2, v_3] = IfThenElse(v_1<72 ? 1 : 0, t_[0, v_1, v_2, v_3], t__[0, v_1 - 72, v_2, v_3]);
[DEBUG kernel.cpp:2492]         }
[DEBUG kernel.cpp:2492]       }
[DEBUG kernel.cpp:2492]     }
[DEBUG kernel.cpp:2492]   }
[DEBUG kernel.cpp:2492]   for (int v_4 = 0; v_4 < 1; v_4++) {
[DEBUG kernel.cpp:2492]     for (int v_5 = 0; v_5 < 144; v_5++) {
[DEBUG kernel.cpp:2492]       for (int v_6 = 0; v_6 < 112; v_6++) {
[DEBUG kernel.cpp:2492]         for (int v_7 = 0; v_7 < 112; v_7++) {
[DEBUG kernel.cpp:2492]           aten_batch_norm[v_4, v_5, v_6, v_7] = float(double(aten_cat[0, v_5, v_6, v_7]) * ((rsqrt(double(const_self.blocks.1.0.bn1.running_var[v_5]) + 1e-05)) * double(const_self.blocks.1.0.bn1.running_var[v_5])) + (double(const_self.blocks.1.0.bn1.running_mean[v_5]) - double(const_self.blocks.1.0.bn1.running_mean[v_5]) * ((rsqrt(double(const_self.blocks.1.0.bn1.running_var[v_5]) + 1e-05)) * double(const_self.blocks.1.0.bn1.running_var[v_5]))));
[DEBUG kernel.cpp:2492]         }
[DEBUG kernel.cpp:2492]       }
[DEBUG kernel.cpp:2492]     }
[DEBUG kernel.cpp:2492]   }
[DEBUG kernel.cpp:2492] }
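The second loop nest is the standard constant-folded form of inference batch norm: with s = weight * rsqrt(var + eps), the op reduces to y = x * s + (bias - mean * s). (Per the graph above, the frozen module passes `running_var` and `running_mean` in the `weight` and `bias` argument slots, which is why only those two constants appear in the Stmt.) A minimal sketch of that rewrite checked against `torch.nn.functional.batch_norm`, for reference:

```python
import torch

def bn_inference(x, weight, bias, mean, var, eps=1e-5):
    # y = (x - mean) * weight / sqrt(var + eps) + bias, folded into a single
    # per-channel scale-and-shift: y = x * s + (bias - mean * s).
    s = weight * torch.rsqrt(var + eps)
    return x * s.view(1, -1, 1, 1) + (bias - mean * s).view(1, -1, 1, 1)

x = torch.randn(1, 144, 112, 112)
w, b = torch.randn(144), torch.randn(144)
m, v = torch.randn(144), torch.rand(144) + 0.5
ref = torch.nn.functional.batch_norm(x, m, v, w, b, False, 0.1, 1e-5)
assert torch.allclose(bn_inference(x, w, b, m, v), ref, atol=1e-5)
```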
[DEBUG kernel.cpp:2514] after inline{
[DEBUG kernel.cpp:2514]   for (int v = 0; v < 1; v++) {
[DEBUG kernel.cpp:2514]     for (int v_1 = 0; v_1 < 144; v_1++) {
[DEBUG kernel.cpp:2514]       for (int v_2 = 0; v_2 < 112; v_2++) {
[DEBUG kernel.cpp:2514]         for (int v_3 = 0; v_3 < 112; v_3++) {
[DEBUG kernel.cpp:2514]           aten_cat[v, v_1, v_2, v_3] = IfThenElse(v_1<72 ? 1 : 0, t_[0, v_1, v_2, v_3], t__[0, v_1 - 72, v_2, v_3]);
[DEBUG kernel.cpp:2514]         }
[DEBUG kernel.cpp:2514]       }
[DEBUG kernel.cpp:2514]     }
[DEBUG kernel.cpp:2514]   }
[DEBUG kernel.cpp:2514]   for (int v_4 = 0; v_4 < 1; v_4++) {
[DEBUG kernel.cpp:2514]     for (int v_5 = 0; v_5 < 144; v_5++) {
[DEBUG kernel.cpp:2514]       for (int v_6 = 0; v_6 < 112; v_6++) {
[DEBUG kernel.cpp:2514]         for (int v_7 = 0; v_7 < 112; v_7++) {
[DEBUG kernel.cpp:2514]           aten_batch_norm[v_4, v_5, v_6, v_7] = float(double(IfThenElse(v_5<72 ? 1 : 0, t_[0, v_5, v_6, v_7], t__[0, v_5 - 72, v_6, v_7])) * ((rsqrt(double(const_self.blocks.1.0.bn1.running_var[v_5]) + 1e-05)) * double(const_self.blocks.1.0.bn1.running_var[v_5])) + (double(const_self.blocks.1.0.bn1.running_mean[v_5]) - double(const_self.blocks.1.0.bn1.running_mean[v_5]) * ((rsqrt(double(const_self.blocks.1.0.bn1.running_var[v_5]) + 1e-05)) * double(const_self.blocks.1.0.bn1.running_var[v_5]))));
[DEBUG kernel.cpp:2514]         }
[DEBUG kernel.cpp:2514]       }
[DEBUG kernel.cpp:2514]     }
[DEBUG kernel.cpp:2514]   }
[DEBUG kernel.cpp:2514] }
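The only change from the Original Stmt is that the cat's `IfThenElse` select now appears inside the batch_norm body: compute-inlining substitutes the producer's expression for loads of its buffer, trading a recomputed select against a round trip through memory. (The `aten_cat` store itself survives because the graph also returns `%x.690`.) In miniature, as illustrative Python rather than the TE IR:

```python
# Miniature illustration of compute-inlining: the consumer recomputes the
# producer's expression instead of loading the materialized buffer.
a, b = [1.0] * 72, [2.0] * 72

def cat(i):                       # stands in for aten_cat
    return a[i] if i < 72 else b[i - 72]

cat_buf = [cat(i) for i in range(144)]

def bn_before_inline(i):          # loads the materialized buffer
    return cat_buf[i] * 0.5

def bn_after_inline(i):           # producer expression substituted in place
    return (a[i] if i < 72 else b[i - 72]) * 0.5

assert all(bn_before_inline(i) == bn_after_inline(i) for i in range(144))
```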
[DEBUG kernel.cpp:2613] Final Stmt:
[DEBUG kernel.cpp:2613] {
[DEBUG kernel.cpp:2613]   for (int _flat_outer = 0; _flat_outer < 3528; _flat_outer++) /* blockIdx.x */{
[DEBUG kernel.cpp:2613]     for (int _flat_inner = 0; _flat_inner < 512; _flat_inner++) /* threadIdx.x */{
[DEBUG kernel.cpp:2613]       aten_cat[_flat_inner + 512 * _flat_outer] = IfThenElse(((_flat_inner + 512 * _flat_outer) / 12544) % 144<72 ? 1 : 0, t_[(_flat_inner + 512 * _flat_outer) % 1806336], t__[(_flat_inner + 512 * _flat_outer) % 1806336 - 903168]);
[DEBUG kernel.cpp:2613]     }
[DEBUG kernel.cpp:2613]   }
[DEBUG kernel.cpp:2613]   for (int _flat_outer_1 = 0; _flat_outer_1 < 3528; _flat_outer_1++) /* blockIdx.x */{
[DEBUG kernel.cpp:2613]     for (int _flat_inner_1 = 0; _flat_inner_1 < 512; _flat_inner_1++) /* threadIdx.x */{
[DEBUG kernel.cpp:2613]       aten_batch_norm[_flat_inner_1 + 512 * _flat_outer_1] = float(double(IfThenElse(((_flat_inner_1 + 512 * _flat_outer_1) / 12544) % 144<72 ? 1 : 0, t_[(_flat_inner_1 + 512 * _flat_outer_1) % 1806336], t__[(_flat_inner_1 + 512 * _flat_outer_1) % 1806336 - 903168])) * ((rsqrt(double(const_self.blocks.1.0.bn1.running_var[((_flat_inner_1 + 512 * _flat_outer_1) / 12544) % 144]) + 1e-05)) * double(const_self.blocks.1.0.bn1.running_var[((_flat_inner_1 + 512 * _flat_outer_1) / 12544) % 144])) + (double(const_self.blocks.1.0.bn1.running_mean[((_flat_inner_1 + 512 * _flat_outer_1) / 12544) % 144]) - double(const_self.blocks.1.0.bn1.running_mean[((_flat_inner_1 + 512 * _flat_outer_1) / 12544) % 144]) * ((rsqrt(double(const_self.blocks.1.0.bn1.running_var[((_flat_inner_1 + 512 * _flat_outer_1) / 12544) % 144]) + 1e-05)) * double(const_self.blocks.1.0.bn1.running_var[((_flat_inner_1 + 512 * _flat_outer_1) / 12544) % 144]))));
[DEBUG kernel.cpp:2613]     }
[DEBUG kernel.cpp:2613]   }
[DEBUG kernel.cpp:2613] }
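Each four-deep loop nest has been flattened into a single index and split into a block/thread pair for the GPU launch. The constants check out against the shapes in the graph:

```python
# Launch-geometry sanity check for the flattened Final Stmt.
numel = 1 * 144 * 112 * 112      # elements in the (1, 144, 112, 112) output
assert numel == 1806336          # the modulus applied to the flat index
assert numel == 3528 * 512       # gpu_block_extents * gpu_thread_extents below
assert 112 * 112 == 12544        # (flat / 12544) % 144 recovers the channel
assert 72 * 112 * 112 == 903168  # offset subtracted when reading t__
```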
[DEBUG cuda_codegen.cpp:1056] Fused TE CUDA kernel:
[DEBUG cuda_codegen.cpp:1056]
[DEBUG cuda_codegen.cpp:1056] #define NAN __int_as_float(0x7fffffff)
[DEBUG cuda_codegen.cpp:1056] #define POS_INFINITY __int_as_float(0x7f800000)
[DEBUG cuda_codegen.cpp:1056] #define NEG_INFINITY __int_as_float(0xff800000)
[DEBUG cuda_codegen.cpp:1056]
[DEBUG cuda_codegen.cpp:1056]
[DEBUG cuda_codegen.cpp:1056] template<typename T>
[DEBUG cuda_codegen.cpp:1056] __device__ T maximum(T a, T b) {
[DEBUG cuda_codegen.cpp:1056]   return isnan(a) ? a : (a > b ? a : b);
[DEBUG cuda_codegen.cpp:1056] }
[DEBUG cuda_codegen.cpp:1056]
[DEBUG cuda_codegen.cpp:1056] template<typename T>
[DEBUG cuda_codegen.cpp:1056] __device__ T minimum(T a, T b) {
[DEBUG cuda_codegen.cpp:1056]   return isnan(a) ? a : (a < b ? a : b);
[DEBUG cuda_codegen.cpp:1056] }
[DEBUG cuda_codegen.cpp:1056]
[DEBUG cuda_codegen.cpp:1056] extern "C" __global__
[DEBUG cuda_codegen.cpp:1056] void fused_cat_batch_norm(float* t_, float* t__, float* aten_batch_norm, float* aten_cat, float* const_self.blocks.1.0.bn1.running_mean, float* const_self.blocks.1.0.bn1.running_var) {
[DEBUG cuda_codegen.cpp:1056] {
[DEBUG cuda_codegen.cpp:1056]   aten_cat[threadIdx.x + 512 * blockIdx.x] = ((((threadIdx.x + 512 * blockIdx.x) / 12544) % 144<72 ? 1 : 0) ? __ldg(t_ + (threadIdx.x + 512 * blockIdx.x) % 1806336) : __ldg(t__ + (threadIdx.x + 512 * blockIdx.x) % 1806336 - 903168));
[DEBUG cuda_codegen.cpp:1056]   float const_self.blocks.1.0.bn1.running_var_1 = __ldg(const_self.blocks.1.0.bn1.running_var + ((threadIdx.x + 512 * blockIdx.x) / 12544) % 144);
[DEBUG cuda_codegen.cpp:1056]   float const_self.blocks.1.0.bn1.running_mean_1 = __ldg(const_self.blocks.1.0.bn1.running_mean + ((threadIdx.x + 512 * blockIdx.x) / 12544) % 144);
[DEBUG cuda_codegen.cpp:1056]   aten_batch_norm[threadIdx.x + 512 * blockIdx.x] = (float)((double)(((((threadIdx.x + 512 * blockIdx.x) / 12544) % 144<72 ? 1 : 0) ? __ldg(t_ + (threadIdx.x + 512 * blockIdx.x) % 1806336) : __ldg(t__ + (threadIdx.x + 512 * blockIdx.x) % 1806336 - 903168))) * ((rsqrt((double)(const_self.blocks.1.0.bn1.running_var_1) + 1e-05)) * (double)(const_self.blocks.1.0.bn1.running_var_1)) + ((double)(const_self.blocks.1.0.bn1.running_mean_1) - (double)(const_self.blocks.1.0.bn1.running_mean_1) * ((rsqrt((double)(const_self.blocks.1.0.bn1.running_var_1) + 1e-05)) * (double)(const_self.blocks.1.0.bn1.running_var_1))));
[DEBUG cuda_codegen.cpp:1056] }
[DEBUG cuda_codegen.cpp:1056] }
[DEBUG cuda_codegen.cpp:1056] gpu_block_extents: (3528, 1, 1)
[DEBUG cuda_codegen.cpp:1056] gpu_thread_extents: (512, 1, 1)
Begin 0 | |
Begin 1 | |
Traceback (most recent call last): | |
File "debug.py", line 13, in <module> | |
jit_model(x) | |
File "/home/ec2-user/anaconda3/envs/robieta_debug_env/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1065, in _call_impl | |
return forward_call(*input, **kwargs) | |
RuntimeError: default_program(18): error: expected a ")" | |
default_program(18): error: extra text after expected end of number | |
default_program(18): error: extra text after expected end of number | |
default_program(21): error: expected a ";" | |
default_program(21): error: extra text after expected end of number | |
default_program(21): error: extra text after expected end of number | |
default_program(22): error: "const_self" has already been declared in the current scope | |
default_program(22): error: expected a ";" | |
default_program(22): error: extra text after expected end of number | |
default_program(22): error: extra text after expected end of number | |
default_program(23): error: extra text after expected end of number | |
default_program(23): error: expression must have class type | |
default_program(23): error: expected a ")" | |
default_program(23): error: extra text after expected end of number | |
default_program(23): error: expression must have class type | |
default_program(23): error: expected a ")" | |
default_program(23): error: extra text after expected end of number | |
default_program(23): error: expression must have class type | |
default_program(23): error: expected a ")" | |
default_program(23): error: extra text after expected end of number | |
default_program(23): error: expression must have class type | |
default_program(23): error: expected a ")" | |
default_program(23): error: extra text after expected end of number | |
default_program(23): error: expression must have class type | |
default_program(23): error: expected a ")" | |
default_program(23): error: extra text after expected end of number | |
default_program(23): error: expression must have class type | |
default_program(23): error: expected a ")" | |
default_program(21): warning: variable "const_self" was declared but never referenced | |
28 errors detected in the compilation of "default_program". | |
nvrtc compilation failed: | |
#define NAN __int_as_float(0x7fffffff)
#define POS_INFINITY __int_as_float(0x7f800000)
#define NEG_INFINITY __int_as_float(0xff800000)

template<typename T>
__device__ T maximum(T a, T b) {
  return isnan(a) ? a : (a > b ? a : b);
}

template<typename T>
__device__ T minimum(T a, T b) {
  return isnan(a) ? a : (a < b ? a : b);
}

extern "C" __global__
void fused_cat_batch_norm(float* t_, float* t__, float* aten_batch_norm, float* aten_cat, float* const_self.blocks.1.0.bn1.running_mean, float* const_self.blocks.1.0.bn1.running_var) {
{
  aten_cat[threadIdx.x + 512 * blockIdx.x] = ((((threadIdx.x + 512 * blockIdx.x) / 12544) % 144<72 ? 1 : 0) ? __ldg(t_ + (threadIdx.x + 512 * blockIdx.x) % 1806336) : __ldg(t__ + (threadIdx.x + 512 * blockIdx.x) % 1806336 - 903168));
  float const_self.blocks.1.0.bn1.running_var_1 = __ldg(const_self.blocks.1.0.bn1.running_var + ((threadIdx.x + 512 * blockIdx.x) / 12544) % 144);
  float const_self.blocks.1.0.bn1.running_mean_1 = __ldg(const_self.blocks.1.0.bn1.running_mean + ((threadIdx.x + 512 * blockIdx.x) / 12544) % 144);
  aten_batch_norm[threadIdx.x + 512 * blockIdx.x] = (float)((double)(((((threadIdx.x + 512 * blockIdx.x) / 12544) % 144<72 ? 1 : 0) ? __ldg(t_ + (threadIdx.x + 512 * blockIdx.x) % 1806336) : __ldg(t__ + (threadIdx.x + 512 * blockIdx.x) % 1806336 - 903168))) * ((rsqrt((double)(const_self.blocks.1.0.bn1.running_var_1) + 1e-05)) * (double)(const_self.blocks.1.0.bn1.running_var_1)) + ((double)(const_self.blocks.1.0.bn1.running_mean_1) - (double)(const_self.blocks.1.0.bn1.running_mean_1) * ((rsqrt((double)(const_self.blocks.1.0.bn1.running_var_1) + 1e-05)) * (double)(const_self.blocks.1.0.bn1.running_var_1))));
}
}
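Every one of the 28 errors traces back to the kernel's parameter names: `const_self.blocks.1.0.bn1.running_mean` and `const_self.blocks.1.0.bn1.running_var` are qualified Python attribute paths, not valid C identifiers. nvrtc parses `float* const_self`, then trips over the trailing `.blocks.1.0...` tokens: `expected a ")"` on the signature, `extra text after expected end of number` where `.1.0` scans as a malformed numeric literal, a duplicate declaration of `const_self` when the second dotted local is emitted, and `expression must have class type` where `const_self.blocks` is treated as member access on a non-class value. The codegen needs to sanitize buffer names before emitting CUDA; a hypothetical sketch of the required mapping (illustrative only, not PyTorch's actual implementation):

```python
import re

def sanitize_identifier(name: str) -> str:
    # Map an arbitrary buffer name to a valid C identifier: replace every
    # character outside [A-Za-z0-9_] with '_' and guard a leading digit.
    out = re.sub(r"[^0-9A-Za-z_]", "_", name)
    return "_" + out if out[0].isdigit() else out

assert (sanitize_identifier("const_self.blocks.1.0.bn1.running_mean")
        == "const_self_blocks_1_0_bn1_running_mean")
```

A real fix would also need to unique the results (e.g. a numeric suffix), since two distinct dotted names could sanitize to the same string.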