@robieta
Created August 25, 2021 15:21
Codegen Debug
[DUMP kernel.cpp:2913] TensorExprKernel graph:
[DUMP kernel.cpp:2913] graph(%0 : Float(1, 72, 112, 112, strides=[903168, 12544, 112, 1], requires_grad=0, device=cuda:0),
[DUMP kernel.cpp:2913] %1 : Float(1, 72, 112, 112, strides=[903168, 12544, 112, 1], requires_grad=0, device=cuda:0)):
[DUMP kernel.cpp:2913] %self.blocks.0.0.has_residual : bool = prim::Constant[value=1]()
[DUMP kernel.cpp:2913] %3 : float = prim::Constant[value=1.0000000000000001e-05]()
[DUMP kernel.cpp:2913] %4 : float = prim::Constant[value=0.10000000000000001]()
[DUMP kernel.cpp:2913] %self.bn1.training : bool = prim::Constant[value=0]()
[DUMP kernel.cpp:2913] %self.blocks.1.0.bn1.running_mean : Float(144, strides=[1], requires_grad=0, device=cuda:0) = prim::Constant[value=<Tensor>]()
[DUMP kernel.cpp:2913] %self.blocks.1.0.bn1.running_var : Float(144, strides=[1], requires_grad=0, device=cuda:0) = prim::Constant[value=<Tensor>]()
[DUMP kernel.cpp:2913] %8 : int = prim::Constant[value=1]()
[DUMP kernel.cpp:2913] %x_out.199 : Tensor[] = prim::ListConstruct(%0, %1)
[DUMP kernel.cpp:2913] %x.690 : Float(1, 144, 112, 112, strides=[1806336, 12544, 112, 1], requires_grad=0, device=cuda:0) = aten::cat(%x_out.199, %8) # /home/ec2-user/anaconda3/envs/robieta_debug_env/lib/python3.7/site-packages/timm/models/layers/mixed_conv2d.py:50:12
[DUMP kernel.cpp:2913] %x.686 : Float(1, 144, 112, 112, strides=[1806336, 12544, 112, 1], requires_grad=0, device=cuda:0) = aten::batch_norm(%x.690, %self.blocks.1.0.bn1.running_var, %self.blocks.1.0.bn1.running_mean, %self.blocks.1.0.bn1.running_mean, %self.blocks.1.0.bn1.running_var, %self.bn1.training, %4, %3, %self.blocks.0.0.has_residual) # /home/ec2-user/anaconda3/envs/robieta_debug_env/lib/python3.7/site-packages/torch/nn/functional.py:2282:11
[DUMP kernel.cpp:2913] return (%x.686, %x.690)
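For context, the dumped fusion group is just aten::cat of two 72-channel tensors followed by aten::batch_norm in eval mode, with the BatchNorm running statistics folded into the graph as named constants. A hypothetical minimal repro along those lines (the original debug.py is not shown in this gist; class and variable names below are illustrative) might look like:

    # Hypothetical repro sketch, not the original debug.py: a frozen, scripted
    # eval-mode module whose BatchNorm running stats become prim::Constant
    # tensors named after their attribute path, as in the dump above.
    import torch
    import torch.nn as nn

    class CatBN(nn.Module):
        def __init__(self):
            super().__init__()
            self.bn1 = nn.BatchNorm2d(144)

        def forward(self, a, b):
            x = torch.cat([a, b], dim=1)   # aten::cat along channels (%8 = 1)
            return self.bn1(x), x          # the fused graph returns both outputs

    model = CatBN().cuda().eval()
    jit_model = torch.jit.freeze(torch.jit.script(model))

    a = torch.randn(1, 72, 112, 112, device="cuda")
    b = torch.randn(1, 72, 112, 112, device="cuda")
    for i in range(2):
        print("Begin", i)
        jit_model(a, b)   # a later run triggers TE fusion and the NVRTC compile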
[DEBUG kernel.cpp:2492] Original Stmt:
[DEBUG kernel.cpp:2492] {
[DEBUG kernel.cpp:2492] for (int v = 0; v < 1; v++) {
[DEBUG kernel.cpp:2492] for (int v_1 = 0; v_1 < 144; v_1++) {
[DEBUG kernel.cpp:2492] for (int v_2 = 0; v_2 < 112; v_2++) {
[DEBUG kernel.cpp:2492] for (int v_3 = 0; v_3 < 112; v_3++) {
[DEBUG kernel.cpp:2492] aten_cat[v, v_1, v_2, v_3] = IfThenElse(v_1<72 ? 1 : 0, t_[0, v_1, v_2, v_3], t__[0, v_1 - 72, v_2, v_3]);
[DEBUG kernel.cpp:2492] }
[DEBUG kernel.cpp:2492] }
[DEBUG kernel.cpp:2492] }
[DEBUG kernel.cpp:2492] }
[DEBUG kernel.cpp:2492] for (int v_4 = 0; v_4 < 1; v_4++) {
[DEBUG kernel.cpp:2492] for (int v_5 = 0; v_5 < 144; v_5++) {
[DEBUG kernel.cpp:2492] for (int v_6 = 0; v_6 < 112; v_6++) {
[DEBUG kernel.cpp:2492] for (int v_7 = 0; v_7 < 112; v_7++) {
[DEBUG kernel.cpp:2492] aten_batch_norm[v_4, v_5, v_6, v_7] = float(double(aten_cat[0, v_5, v_6, v_7]) * ((rsqrt(double(const_self.blocks.1.0.bn1.running_var[v_5]) + 1e-05)) * double(const_self.blocks.1.0.bn1.running_var[v_5])) + (double(const_self.blocks.1.0.bn1.running_mean[v_5]) - double(const_self.blocks.1.0.bn1.running_mean[v_5]) * ((rsqrt(double(const_self.blocks.1.0.bn1.running_var[v_5]) + 1e-05)) * double(const_self.blocks.1.0.bn1.running_var[v_5]))));
[DEBUG kernel.cpp:2492] }
[DEBUG kernel.cpp:2492] }
[DEBUG kernel.cpp:2492] }
[DEBUG kernel.cpp:2492] }
[DEBUG kernel.cpp:2492] }
[DEBUG kernel.cpp:2514] after inline{
[DEBUG kernel.cpp:2514] for (int v = 0; v < 1; v++) {
[DEBUG kernel.cpp:2514] for (int v_1 = 0; v_1 < 144; v_1++) {
[DEBUG kernel.cpp:2514] for (int v_2 = 0; v_2 < 112; v_2++) {
[DEBUG kernel.cpp:2514] for (int v_3 = 0; v_3 < 112; v_3++) {
[DEBUG kernel.cpp:2514] aten_cat[v, v_1, v_2, v_3] = IfThenElse(v_1<72 ? 1 : 0, t_[0, v_1, v_2, v_3], t__[0, v_1 - 72, v_2, v_3]);
[DEBUG kernel.cpp:2514] }
[DEBUG kernel.cpp:2514] }
[DEBUG kernel.cpp:2514] }
[DEBUG kernel.cpp:2514] }
[DEBUG kernel.cpp:2514] for (int v_4 = 0; v_4 < 1; v_4++) {
[DEBUG kernel.cpp:2514] for (int v_5 = 0; v_5 < 144; v_5++) {
[DEBUG kernel.cpp:2514] for (int v_6 = 0; v_6 < 112; v_6++) {
[DEBUG kernel.cpp:2514] for (int v_7 = 0; v_7 < 112; v_7++) {
[DEBUG kernel.cpp:2514] aten_batch_norm[v_4, v_5, v_6, v_7] = float(double(IfThenElse(v_5<72 ? 1 : 0, t_[0, v_5, v_6, v_7], t__[0, v_5 - 72, v_6, v_7])) * ((rsqrt(double(const_self.blocks.1.0.bn1.running_var[v_5]) + 1e-05)) * double(const_self.blocks.1.0.bn1.running_var[v_5])) + (double(const_self.blocks.1.0.bn1.running_mean[v_5]) - double(const_self.blocks.1.0.bn1.running_mean[v_5]) * ((rsqrt(double(const_self.blocks.1.0.bn1.running_var[v_5]) + 1e-05)) * double(const_self.blocks.1.0.bn1.running_var[v_5]))));
[DEBUG kernel.cpp:2514] }
[DEBUG kernel.cpp:2514] }
[DEBUG kernel.cpp:2514] }
[DEBUG kernel.cpp:2514] }
[DEBUG kernel.cpp:2514] }
[DEBUG kernel.cpp:2613] Final Stmt:
[DEBUG kernel.cpp:2613] {
[DEBUG kernel.cpp:2613] for (int _flat_outer = 0; _flat_outer < 3528; _flat_outer++) /* blockIdx.x */{
[DEBUG kernel.cpp:2613] for (int _flat_inner = 0; _flat_inner < 512; _flat_inner++) /* threadIdx.x */{
[DEBUG kernel.cpp:2613] aten_cat[_flat_inner + 512 * _flat_outer] = IfThenElse(((_flat_inner + 512 * _flat_outer) / 12544) % 144<72 ? 1 : 0, t_[(_flat_inner + 512 * _flat_outer) % 1806336], t__[(_flat_inner + 512 * _flat_outer) % 1806336 - 903168]);
[DEBUG kernel.cpp:2613] }
[DEBUG kernel.cpp:2613] }
[DEBUG kernel.cpp:2613] for (int _flat_outer_1 = 0; _flat_outer_1 < 3528; _flat_outer_1++) /* blockIdx.x */{
[DEBUG kernel.cpp:2613] for (int _flat_inner_1 = 0; _flat_inner_1 < 512; _flat_inner_1++) /* threadIdx.x */{
[DEBUG kernel.cpp:2613] aten_batch_norm[_flat_inner_1 + 512 * _flat_outer_1] = float(double(IfThenElse(((_flat_inner_1 + 512 * _flat_outer_1) / 12544) % 144<72 ? 1 : 0, t_[(_flat_inner_1 + 512 * _flat_outer_1) % 1806336], t__[(_flat_inner_1 + 512 * _flat_outer_1) % 1806336 - 903168])) * ((rsqrt(double(const_self.blocks.1.0.bn1.running_var[((_flat_inner_1 + 512 * _flat_outer_1) / 12544) % 144]) + 1e-05)) * double(const_self.blocks.1.0.bn1.running_var[((_flat_inner_1 + 512 * _flat_outer_1) / 12544) % 144])) + (double(const_self.blocks.1.0.bn1.running_mean[((_flat_inner_1 + 512 * _flat_outer_1) / 12544) % 144]) - double(const_self.blocks.1.0.bn1.running_mean[((_flat_inner_1 + 512 * _flat_outer_1) / 12544) % 144]) * ((rsqrt(double(const_self.blocks.1.0.bn1.running_var[((_flat_inner_1 + 512 * _flat_outer_1) / 12544) % 144]) + 1e-05)) * double(const_self.blocks.1.0.bn1.running_var[((_flat_inner_1 + 512 * _flat_outer_1) / 12544) % 144]))));
[DEBUG kernel.cpp:2613] }
[DEBUG kernel.cpp:2613] }
[DEBUG kernel.cpp:2613] }
[DEBUG cuda_codegen.cpp:1056] Fused TE CUDA kernel:
[DEBUG cuda_codegen.cpp:1056]
[DEBUG cuda_codegen.cpp:1056] #define NAN __int_as_float(0x7fffffff)
[DEBUG cuda_codegen.cpp:1056] #define POS_INFINITY __int_as_float(0x7f800000)
[DEBUG cuda_codegen.cpp:1056] #define NEG_INFINITY __int_as_float(0xff800000)
[DEBUG cuda_codegen.cpp:1056]
[DEBUG cuda_codegen.cpp:1056]
[DEBUG cuda_codegen.cpp:1056] template<typename T>
[DEBUG cuda_codegen.cpp:1056] __device__ T maximum(T a, T b) {
[DEBUG cuda_codegen.cpp:1056] return isnan(a) ? a : (a > b ? a : b);
[DEBUG cuda_codegen.cpp:1056] }
[DEBUG cuda_codegen.cpp:1056]
[DEBUG cuda_codegen.cpp:1056] template<typename T>
[DEBUG cuda_codegen.cpp:1056] __device__ T minimum(T a, T b) {
[DEBUG cuda_codegen.cpp:1056] return isnan(a) ? a : (a < b ? a : b);
[DEBUG cuda_codegen.cpp:1056] }
[DEBUG cuda_codegen.cpp:1056]
[DEBUG cuda_codegen.cpp:1056] extern "C" __global__
[DEBUG cuda_codegen.cpp:1056] void fused_cat_batch_norm(float* t_, float* t__, float* aten_batch_norm, float* aten_cat, float* const_self.blocks.1.0.bn1.running_mean, float* const_self.blocks.1.0.bn1.running_var) {
[DEBUG cuda_codegen.cpp:1056] {
[DEBUG cuda_codegen.cpp:1056] aten_cat[threadIdx.x + 512 * blockIdx.x] = ((((threadIdx.x + 512 * blockIdx.x) / 12544) % 144<72 ? 1 : 0) ? __ldg(t_ + (threadIdx.x + 512 * blockIdx.x) % 1806336) : __ldg(t__ + (threadIdx.x + 512 * blockIdx.x) % 1806336 - 903168));
[DEBUG cuda_codegen.cpp:1056] float const_self.blocks.1.0.bn1.running_var_1 = __ldg(const_self.blocks.1.0.bn1.running_var + ((threadIdx.x + 512 * blockIdx.x) / 12544) % 144);
[DEBUG cuda_codegen.cpp:1056] float const_self.blocks.1.0.bn1.running_mean_1 = __ldg(const_self.blocks.1.0.bn1.running_mean + ((threadIdx.x + 512 * blockIdx.x) / 12544) % 144);
[DEBUG cuda_codegen.cpp:1056] aten_batch_norm[threadIdx.x + 512 * blockIdx.x] = (float)((double)(((((threadIdx.x + 512 * blockIdx.x) / 12544) % 144<72 ? 1 : 0) ? __ldg(t_ + (threadIdx.x + 512 * blockIdx.x) % 1806336) : __ldg(t__ + (threadIdx.x + 512 * blockIdx.x) % 1806336 - 903168))) * ((rsqrt((double)(const_self.blocks.1.0.bn1.running_var_1) + 1e-05)) * (double)(const_self.blocks.1.0.bn1.running_var_1)) + ((double)(const_self.blocks.1.0.bn1.running_mean_1) - (double)(const_self.blocks.1.0.bn1.running_mean_1) * ((rsqrt((double)(const_self.blocks.1.0.bn1.running_var_1) + 1e-05)) * (double)(const_self.blocks.1.0.bn1.running_var_1))));
[DEBUG cuda_codegen.cpp:1056] }
[DEBUG cuda_codegen.cpp:1056] }
[DEBUG cuda_codegen.cpp:1056] gpu_block_extents: (3528, 1, 1)
[DEBUG cuda_codegen.cpp:1056] gpu_thread_extents: (512, 1, 1)
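As a quick consistency check on the launch configuration: the flattened loops run one thread per element of the concatenated 1x144x112x112 output, and 12544 = 112*112 is the per-channel plane size used to recover the channel index in the kernel's indexing math. A small arithmetic check (Python, for illustration):

    # Launch-extent sanity check for the fused kernel above (illustrative).
    numel = 1 * 144 * 112 * 112   # elements in the aten::cat output
    assert numel == 1806336       # the modulus appearing in the indexing math
    assert 112 * 112 == 12544     # plane size; (idx // 12544) % 144 is the channel
    assert 3528 * 512 == numel    # gpu_block_extents (3528,1,1) * gpu_thread_extents (512,1,1)
    print("launch covers", numel, "elements")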
Begin 0
Begin 1
Traceback (most recent call last):
File "debug.py", line 13, in <module>
jit_model(x)
File "/home/ec2-user/anaconda3/envs/robieta_debug_env/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1065, in _call_impl
return forward_call(*input, **kwargs)
RuntimeError: default_program(18): error: expected a ")"
default_program(18): error: extra text after expected end of number
default_program(18): error: extra text after expected end of number
default_program(21): error: expected a ";"
default_program(21): error: extra text after expected end of number
default_program(21): error: extra text after expected end of number
default_program(22): error: "const_self" has already been declared in the current scope
default_program(22): error: expected a ";"
default_program(22): error: extra text after expected end of number
default_program(22): error: extra text after expected end of number
default_program(23): error: extra text after expected end of number
default_program(23): error: expression must have class type
default_program(23): error: expected a ")"
default_program(23): error: extra text after expected end of number
default_program(23): error: expression must have class type
default_program(23): error: expected a ")"
default_program(23): error: extra text after expected end of number
default_program(23): error: expression must have class type
default_program(23): error: expected a ")"
default_program(23): error: extra text after expected end of number
default_program(23): error: expression must have class type
default_program(23): error: expected a ")"
default_program(23): error: extra text after expected end of number
default_program(23): error: expression must have class type
default_program(23): error: expected a ")"
default_program(23): error: extra text after expected end of number
default_program(23): error: expression must have class type
default_program(23): error: expected a ")"
default_program(21): warning: variable "const_self" was declared but never referenced
28 errors detected in the compilation of "default_program".
nvrtc compilation failed:
#define NAN __int_as_float(0x7fffffff)
#define POS_INFINITY __int_as_float(0x7f800000)
#define NEG_INFINITY __int_as_float(0xff800000)
template<typename T>
__device__ T maximum(T a, T b) {
return isnan(a) ? a : (a > b ? a : b);
}
template<typename T>
__device__ T minimum(T a, T b) {
return isnan(a) ? a : (a < b ? a : b);
}
extern "C" __global__
void fused_cat_batch_norm(float* t_, float* t__, float* aten_batch_norm, float* aten_cat, float* const_self.blocks.1.0.bn1.running_mean, float* const_self.blocks.1.0.bn1.running_var) {
{
aten_cat[threadIdx.x + 512 * blockIdx.x] = ((((threadIdx.x + 512 * blockIdx.x) / 12544) % 144<72 ? 1 : 0) ? __ldg(t_ + (threadIdx.x + 512 * blockIdx.x) % 1806336) : __ldg(t__ + (threadIdx.x + 512 * blockIdx.x) % 1806336 - 903168));
float const_self.blocks.1.0.bn1.running_var_1 = __ldg(const_self.blocks.1.0.bn1.running_var + ((threadIdx.x + 512 * blockIdx.x) / 12544) % 144);
float const_self.blocks.1.0.bn1.running_mean_1 = __ldg(const_self.blocks.1.0.bn1.running_mean + ((threadIdx.x + 512 * blockIdx.x) / 12544) % 144);
aten_batch_norm[threadIdx.x + 512 * blockIdx.x] = (float)((double)(((((threadIdx.x + 512 * blockIdx.x) / 12544) % 144<72 ? 1 : 0) ? __ldg(t_ + (threadIdx.x + 512 * blockIdx.x) % 1806336) : __ldg(t__ + (threadIdx.x + 512 * blockIdx.x) % 1806336 - 903168))) * ((rsqrt((double)(const_self.blocks.1.0.bn1.running_var_1) + 1e-05)) * (double)(const_self.blocks.1.0.bn1.running_var_1)) + ((double)(const_self.blocks.1.0.bn1.running_mean_1) - (double)(const_self.blocks.1.0.bn1.running_mean_1) * ((rsqrt((double)(const_self.blocks.1.0.bn1.running_var_1) + 1e-05)) * (double)(const_self.blocks.1.0.bn1.running_var_1))));
}
}
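The nvrtc errors all trace back to the kernel signature: the folded BatchNorm constants are emitted under their qualified attribute names (const_self.blocks.1.0.bn1.running_mean, const_self.blocks.1.0.bn1.running_var), and '.' is not a legal character in a C identifier. nvrtc therefore parses each parameter as just const_self, reports expected a ")" on the signature line, "has already been declared" when the second dotted name also truncates to const_self, and "expression must have class type" wherever the dotted names are used in the body (the ".1.0." fragments also explain the "extra text after expected end of number" messages). Illustrative only, not PyTorch's actual fix: the buffer names would need to be mangled into valid C identifiers before codegen, e.g.:

    # Illustrative name mangling, not the actual change made in PyTorch's codegen:
    # map every character that is invalid in a C identifier to '_'.
    import re

    def sanitize_buffer_name(name: str) -> str:
        out = re.sub(r"[^0-9A-Za-z_]", "_", name)
        return out if not out[0].isdigit() else "_" + out

    print(sanitize_buffer_name("const_self.blocks.1.0.bn1.running_var"))
    # const_self_blocks_1_0_bn1_running_var  (a valid C identifier)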