soumith/out.log

## out.log
[WARNING]: No mapping options supplied. 'Naive' options will be used which might fail compilation
[WARNING]: Autotuning results won't be cached. 'cache' option is not specified
[WARNING]: Using naive options for autotuning

// Floor-style integer division: for n < 0 evaluates -((-n + d - 1) / d), otherwise n / d.
// NOTE(review): the negative branch presumes d > 0 — confirm; floord is not called by the kernel below.
template<typename T> inline __device__ T floord(T n, T d) {
  return n < 0 ? - (-n + d - 1)/d : n / d;
}

// Halide type handling
// Short aliases for the built-in scalar types; the kernel below uses int32 and float32.
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;


extern "C" {
// avgpool_4_32_20_20: for every output element (c4, c5, c6, c7) adds the 2x2 input
// patch input[c4][c5][2*c6 .. 2*c6+1][2*c7 .. 2*c7+1] onto output[c4][c5][c6][c7].
// poutput is viewed as [][32][10][10] and pinput as [][32][20][20]. B, C, H, W and the
// locals b0, b1, b2, t1, t2 are not used. Only sums are formed here; no division by
// the patch size appears in this kernel.
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
  int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
  int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
  float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
  float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
  for (int c4 = 0; c4 <= 3; c4 += 1) {
    for (int c5 = 0; c5 <= 31; c5 += 1) {
      for (int c6 = 0; c6 <= 9; c6 += 1) {
        for (int c7 = 0; c7 <= 9; c7 += 1) {
          for (int c8 = 0; c8 <= 3; c8 += 1) {
            // Thread t0 handles patch row t0; c8 - 2*t0 (0 or 1) is the patch column,
            // so only t0 == 0 and t0 == 1 ever pass this guard.
            if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
              output[c4][c5][c6][c7] = (output[c4][c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]);
            }
            // Barrier after every c8 step, outside the guard so all threads reach it;
            // within a block it separates the updates that different t0 values make
            // to the same output element.
            __syncthreads();
          }
        }
      }
    }
  }
}
}

// Floor-style integer division: for n < 0 evaluates -((-n + d - 1) / d), otherwise n / d.
// NOTE(review): the negative branch presumes d > 0 — confirm; floord is not called by the kernel below.
template<typename T> inline __device__ T floord(T n, T d) {
  return n < 0 ? - (-n + d - 1)/d : n / d;
}

// Halide type handling
// Short aliases for the built-in scalar types; the kernel below uses int32 and float32.
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;


extern "C" {
// avgpool_4_32_20_20: for every output element (c4, c5, c6, c7) adds the 2x2 input
// patch input[c4][c5][2*c6 .. 2*c6+1][2*c7 .. 2*c7+1] onto output[c4][c5][c6][c7].
// poutput is viewed as [][32][10][10] and pinput as [][32][20][20]. B, C, H, W and the
// locals b0, b1, b2, t1, t2 are not used. Only sums are formed here; no division by
// the patch size appears in this kernel.
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
  int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
  int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
  float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
  float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
  for (int c4 = 0; c4 <= 3; c4 += 1) {
    for (int c5 = 0; c5 <= 31; c5 += 1) {
      for (int c6 = 0; c6 <= 9; c6 += 1) {
        for (int c7 = 0; c7 <= 9; c7 += 1) {
          for (int c8 = 0; c8 <= 3; c8 += 1) {
            // Thread t0 handles patch row t0; c8 - 2*t0 (0 or 1) is the patch column,
            // so only t0 == 0 and t0 == 1 ever pass this guard.
            if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
              output[c4][c5][c6][c7] = (output[c4][c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]);
            }
            // Barrier after every c8 step, outside the guard so all threads reach it;
            // within a block it separates the updates that different t0 values make
            // to the same output element.
            __syncthreads();
          }
        }
      }
    }
  }
}
}

// Floor-style integer division: for n < 0 evaluates -((-n + d - 1) / d), otherwise n / d.
// NOTE(review): the negative branch presumes d > 0 — confirm; floord is not called by the kernel below.
template<typename T> inline __device__ T floord(T n, T d) {
  return n < 0 ? - (-n + d - 1)/d : n / d;
}

// Halide type handling
// Short aliases for the built-in scalar types; the kernel below uses int32 and float32.
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;


extern "C" {
// avgpool_4_32_20_20: for every output element (c4, c5, c6, c7) adds the 2x2 input
// patch input[c4][c5][2*c6 .. 2*c6+1][2*c7 .. 2*c7+1] onto output[c4][c5][c6][c7].
// poutput is viewed as [][32][10][10] and pinput as [][32][20][20]. B, C, H, W and the
// locals b0, b1, b2, t1, t2 are not used. Only sums are formed here; no division by
// the patch size appears in this kernel.
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
  int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
  int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
  float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
  float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
  for (int c4 = 0; c4 <= 3; c4 += 1) {
    for (int c5 = 0; c5 <= 31; c5 += 1) {
      for (int c6 = 0; c6 <= 9; c6 += 1) {
        for (int c7 = 0; c7 <= 9; c7 += 1) {
          for (int c8 = 0; c8 <= 3; c8 += 1) {
            // Thread t0 handles patch row t0; c8 - 2*t0 (0 or 1) is the patch column,
            // so only t0 == 0 and t0 == 1 ever pass this guard.
            if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
              output[c4][c5][c6][c7] = (output[c4][c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]);
            }
            // Barrier after every c8 step, outside the guard so all threads reach it;
            // within a block it separates the updates that different t0 values make
            // to the same output element.
            __syncthreads();
          }
        }
      }
    }
  }
}
}

Generation 0	Job[Compiled, GPU] (2, 1)/10   Time (us): best: 13146 median: 13146 worst: 13146
Generation 0	Job[Compiled, GPU] (2, 1)/10   Time (us): best: 13146 median: 13146 worst: 13146
// Floor-style integer division: for n < 0 evaluates -((-n + d - 1) / d), otherwise n / d.
// NOTE(review): the negative branch presumes d > 0 — confirm; floord is not called by the kernel below.
template<typename T> inline __device__ T floord(T n, T d) {
  return n < 0 ? - (-n + d - 1)/d : n / d;
}

// Halide type handling
// Short aliases for the built-in scalar types; the kernel below uses int32 and float32.
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;


extern "C" {
// avgpool_4_32_20_20 (shared-memory variant): adds each 2x2 input patch
// input[..][..][2*r .. 2*r+1][2*c .. 2*c+1] onto the matching output element, working
// on a __shared__ copy of output. Dimension 1 (extent 32) is processed in two tiles
// starting at c1 = 0 and c1 = 20; each tile is copied in, accumulated, and copied back.
// poutput is viewed as [][32][10][10] and pinput as [][32][20][20]. B, C, H, W and
// b0, b1, b2 are not used. Only sums are formed; no division appears in this kernel.
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
  int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
  int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
  float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
  float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
  // Staging tile for output; the last dimension is declared 11 wide but only
  // indices 0..9 are accessed below.
  __shared__ float32 _output_0[4][20][10][11];
  for (int c1 = 0; c1 <= 31; c1 += 20) {
    __syncthreads();
    // Copy-in: threads stride over the tile by 5 (t2), 5 (t1) and 4 (t0).
    // NOTE(review): presumably these strides equal the block dimensions — confirm.
    for (int c3 = 0; c3 <= 3; c3 += 1) {
      // min(19, -c1 + 31) clips the second tile to the 12 remaining dimension-1 entries.
      for (int c4 = t2; c4 <= min(19, -c1 + 31); c4 += 5) {
        for (int c5 = t1; c5 <= 9; c5 += 5) {
          for (int c6 = t0; c6 <= 9; c6 += 4) {
            _output_0[c3][c4][c5][c6] = output[c3][c1 + c4][c5][c6];
          }
        }
      }
    }
    __syncthreads();
    // Accumulation: the last output dimension is split into chunks starting at 0 and 8.
    for (int c3 = 0; c3 <= 9; c3 += 8) {
      for (int c4 = 0; c4 <= 3; c4 += 1) {
        for (int c5 = 0; c5 <= min(19, -c1 + 31); c5 += 1) {
          for (int c6 = 0; c6 <= 9; c6 += 1) {
            for (int c7 = 0; c7 <= min(7, -c3 + 9); c7 += 1) {
              for (int c8 = 0; c8 <= 3; c8 += 1) {
                // Only threads with t1 == 0 and t2 == 0 accumulate; among them t0 is the
                // patch row and c8 - 2*t0 (0 or 1) the patch column, so only t0 == 0 and
                // t0 == 1 pass the guard.
                if (t1 == 0 && t2 == 0 && c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
                  _output_0[c4][c5][c6][c3 + c7] = (_output_0[c4][c5][c6][c3 + c7] + input[c4][c1 + c5][t0 + 2*c6][-2*t0 + 2*c3 + 2*c7 + c8]);
                }
                // Barrier sits outside the guard, so every thread reaches it; it separates
                // the updates that different t0 values make to the same tile element.
                __syncthreads();
              }
            }
          }
        }
      }
    }
    __syncthreads();
    // Copy-out: same strided traversal as the copy-in, writing the tile back to output.
    for (int c3 = 0; c3 <= 3; c3 += 1) {
      for (int c4 = t2; c4 <= min(19, -c1 + 31); c4 += 5) {
        for (int c5 = t1; c5 <= 9; c5 += 5) {
          for (int c6 = t0; c6 <= 9; c6 += 4) {
            output[c3][c1 + c4][c5][c6] = _output_0[c3][c4][c5][c6];
          }
        }
      }
    }
    __syncthreads();
  }
}
}

// Floor-style integer division: for n < 0 evaluates -((-n + d - 1) / d), otherwise n / d.
// NOTE(review): the negative branch presumes d > 0 — confirm; floord is not called by the kernel below.
template<typename T> inline __device__ T floord(T n, T d) {
  return n < 0 ? - (-n + d - 1)/d : n / d;
}

// Halide type handling
// Short aliases for the built-in scalar types; the kernel below uses int32 and float32.
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;


extern "C" {
// avgpool_4_32_20_20 (variant without barriers): for every output element
// (c4, c5, c6, c7) adds the 2x2 input patch
// input[c4][c5][2*c6 .. 2*c6+1][2*c7 .. 2*c7+1] onto output[c4][c5][c6][c7].
// poutput is viewed as [][32][10][10] and pinput as [][32][20][20]. B, C, H, W and the
// locals b0, b1, b2, t1, t2 are not used. Only sums are formed here; no division by
// the patch size appears in this kernel.
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
  int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
  int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
  float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
  float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
  for (int c4 = 0; c4 <= 3; c4 += 1) {
    for (int c5 = 0; c5 <= 31; c5 += 1) {
      for (int c6 = 0; c6 <= 9; c6 += 1) {
        for (int c7 = 0; c7 <= 9; c7 += 1) {
          for (int c8 = 0; c8 <= 3; c8 += 1) {
            // Thread t0 handles patch row t0; c8 - 2*t0 (0 or 1) is the patch column,
            // so only t0 == 0 and t0 == 1 ever pass this guard.
            // NOTE(review): both of those threads read-modify-write the same output
            // element and no barrier separates them in this variant — confirm the
            // launch configuration makes this safe.
            if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
              output[c4][c5][c6][c7] = (output[c4][c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]);
            }
          }
        }
      }
    }
  }
}
}

Generation 0	Job[Compiled, GPU] (3, 2)/10   Time (us): best: 7716 median: 13146 worst: 13146
// Floor-style integer division: for n < 0 evaluates -((-n + d - 1) / d), otherwise n / d.
// NOTE(review): the negative branch presumes d > 0 — confirm; floord is not called by the kernel below.
template<typename T> inline __device__ T floord(T n, T d) {
  return n < 0 ? - (-n + d - 1)/d : n / d;
}

// Halide type handling
// Short aliases for the built-in scalar types; the kernel below uses int32 and float32.
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;


extern "C" {
// avgpool_4_32_20_20: for every output element (c4, c5, c6, c7) adds the 2x2 input
// patch input[c4][c5][2*c6 .. 2*c6+1][2*c7 .. 2*c7+1] onto output[c4][c5][c6][c7].
// poutput is viewed as [][32][10][10] and pinput as [][32][20][20]. B, C, H, W and the
// locals b0, b1, b2, t1, t2 are not used. Only sums are formed here; no division by
// the patch size appears in this kernel.
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
  int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
  int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
  float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
  float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
  for (int c4 = 0; c4 <= 3; c4 += 1) {
    for (int c5 = 0; c5 <= 31; c5 += 1) {
      for (int c6 = 0; c6 <= 9; c6 += 1) {
        for (int c7 = 0; c7 <= 9; c7 += 1) {
          for (int c8 = 0; c8 <= 3; c8 += 1) {
            // Thread t0 handles patch row t0; c8 - 2*t0 (0 or 1) is the patch column,
            // so only t0 == 0 and t0 == 1 ever pass this guard.
            if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
              output[c4][c5][c6][c7] = (output[c4][c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]);
            }
            // Barrier after every c8 step, outside the guard so all threads reach it;
            // within a block it separates the updates that different t0 values make
            // to the same output element.
            __syncthreads();
          }
        }
      }
    }
  }
}
}

Generation 0	Job[Compiled, GPU] (5, 4)/10   Time (us): best: 7716 median: 11094 worst: 13146
// Floor-style integer division: for n < 0 evaluates -((-n + d - 1) / d), otherwise n / d.
// NOTE(review): the negative branch presumes d > 0 — confirm; floord is not called by the kernel below.
template<typename T> inline __device__ T floord(T n, T d) {
  return n < 0 ? - (-n + d - 1)/d : n / d;
}

// Halide type handling
// Short aliases for the built-in scalar types; the kernel below uses int32 and float32.
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;


extern "C" {
// avgpool_4_32_20_20 (tiled variant): adds the 2x2 input patch
// input[c4][c5][2*c6 .. 2*c6+1][2*c7 .. 2*c7+1] onto output[c4][c5][c6][c7].
// Dimension 1 is walked in tiles of 16 (c1) and dimension 2 in tiles of 4 (c2); each
// block covers dimension-0 indices 2*b0 and 2*b0 + 1.
// poutput is viewed as [][32][10][10] and pinput as [][32][20][20]. B, C, H, W and
// b1, b2, t1, t2 are not used. Only sums are formed; no division appears in this kernel.
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
  int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
  int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
  float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
  float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
  for (int c1 = 0; c1 <= 31; c1 += 16) {
    for (int c2 = 0; c2 <= 9; c2 += 4) {
      for (int c4 = 2 * b0; c4 <= 2 * b0 + 1; c4 += 1) {
        for (int c5 = c1; c5 <= c1 + 15; c5 += 1) {
          // min(9, c2 + 3) clips the last dimension-2 tile (c2 == 8) to rows 8..9.
          for (int c6 = c2; c6 <= min(9, c2 + 3); c6 += 1) {
            for (int c7 = 0; c7 <= 9; c7 += 1) {
              for (int c8 = 0; c8 <= 3; c8 += 1) {
                // Thread t0 handles patch row t0; c8 - 2*t0 (0 or 1) is the patch column,
                // so only t0 == 0 and t0 == 1 ever pass this guard.
                if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
                  output[c4][c5][c6][c7] = (output[c4][c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]);
                }
                // Barrier outside the guard; all loop bounds depend only on b0 and loop
                // counters, so every thread of a block reaches it the same number of times.
                __syncthreads();
              }
            }
          }
        }
      }
    }
  }
}
}

Generation 0	Job[Compiled, GPU] (6, 5)/10   Time (us): best: 5871 median: 7900 worst: 13146
// Floor-style integer division: for n < 0 evaluates -((-n + d - 1) / d), otherwise n / d.
// NOTE(review): the negative branch presumes d > 0 — confirm; floord is not called by the kernel below.
template<typename T> inline __device__ T floord(T n, T d) {
  return n < 0 ? - (-n + d - 1)/d : n / d;
}

// Halide type handling
// Short aliases for the built-in scalar types; the kernel below uses int32 and float32.
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;


extern "C" {
// avgpool_4_32_20_20 (shared-memory variant): each block handles dimension-0 indices
// 2*b0 and 2*b0 + 1. It copies that slice of output into a __shared__ tile, adds each
// 2x2 input patch input[..][..][2*r .. 2*r+1][2*c .. 2*c+1] onto the matching tile
// element, then writes the tile back to output.
// poutput is viewed as [][32][10][10] and pinput as [][32][20][20]. B, C, H, W and
// b1, b2, t2 are not used. Only sums are formed; no division appears in this kernel.
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
  int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
  int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
  float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
  float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
  // Staging tile for this block's slice of output; last dimension declared 11 wide.
  __shared__ float32 _output_0[2][32][10][11];
  __syncthreads();
  // Copy-in: each thread stages element [t1][t0] of all 2 x 32 planes.
  // NOTE(review): indexing output directly with t1 and t0 presumes both stay within
  // 0..9 — confirm against the launch configuration.
  for (int c3 = 0; c3 <= 1; c3 += 1) {
    for (int c4 = 0; c4 <= 31; c4 += 1) {
      _output_0[c3][c4][t1][t0] = output[2*b0 + c3][c4][t1][t0];
    }
  }
  __syncthreads();
  for (int c4 = 0; c4 <= 1; c4 += 1) {
    for (int c5 = 0; c5 <= 31; c5 += 1) {
      for (int c6 = 0; c6 <= 9; c6 += 1) {
        for (int c7 = 0; c7 <= 9; c7 += 1) {
          for (int c8 = 0; c8 <= 3; c8 += 1) {
            // Only threads with t1 == 0 accumulate; among them t0 is the patch row and
            // c8 - 2*t0 (0 or 1) the patch column, so only t0 == 0 and t0 == 1 pass.
            if (t1 == 0 && c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
              _output_0[c4][c5][c6][c7] = (_output_0[c4][c5][c6][c7] + input[2*b0 + c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]);
            }
            // Barrier outside the guard so every thread reaches it; it separates the
            // updates that different t0 values make to the same tile element.
            __syncthreads();
          }
        }
      }
    }
  }
  __syncthreads();
  // Copy-out: mirror of the copy-in, writing the accumulated tile back to output.
  for (int c3 = 0; c3 <= 1; c3 += 1) {
    for (int c4 = 0; c4 <= 31; c4 += 1) {
      output[2*b0 + c3][c4][t1][t0] = _output_0[c3][c4][t1][t0];
    }
  }
  __syncthreads();
}
}

Generation 0	Job[Compiled, GPU] (6, 5)/10   Time (us): best: 5871 median: 7900 worst: 13146
// Floor-style integer division: for n < 0 evaluates -((-n + d - 1) / d), otherwise n / d.
// NOTE(review): the negative branch presumes d > 0 — confirm; floord is not called by the kernel below.
template<typename T> inline __device__ T floord(T n, T d) {
  return n < 0 ? - (-n + d - 1)/d : n / d;
}

// Halide type handling
// Short aliases for the built-in scalar types; the kernel below uses int32 and float32.
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;


extern "C" {
// avgpool_4_32_20_20 (tiled variant): adds the 2x2 input patch
// input[c4][c1+c5][2*c2 .. 2*c2+1][2*c7 .. 2*c7+1] onto output[c4][c1+c5][c2][c7].
// Dimension 1 is walked in tiles of 4 (c1 is the tile start, c5 the offset inside it)
// and c2 iterates dimension 2 directly.
// poutput is viewed as [][32][10][10] and pinput as [][32][20][20]. B, C, H, W and
// b0, b1, b2, t1, t2 are not used. Only sums are formed; no division appears here.
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
  int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
  int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
  float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
  float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
  for (int c1 = 0; c1 <= 31; c1 += 4) {
    for (int c2 = 0; c2 <= 9; c2 += 1) {
      for (int c4 = 0; c4 <= 3; c4 += 1) {
        for (int c5 = 0; c5 <= 3; c5 += 1) {
          for (int c7 = 0; c7 <= 9; c7 += 1) {
            for (int c8 = 0; c8 <= 3; c8 += 1) {
              // Thread t0 handles patch row t0; c8 - 2*t0 (0 or 1) is the patch column,
              // so only t0 == 0 and t0 == 1 ever pass this guard.
              if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
                output[c4][c1 + c5][c2][c7] = (output[c4][c1 + c5][c2][c7] + input[c4][c1 + c5][t0 + 2*c2][-2*t0 + 2*c7 + c8]);
              }
              // Barrier after every c8 step, outside the guard so all threads reach it;
              // within a block it separates the updates that different t0 values make
              // to the same output element.
              __syncthreads();
            }
          }
        }
      }
    }
  }
}
}

Generation 0	Job[Compiled, GPU] (8, 7)/10   Time (us): best: 3472 median: 7900 worst: 13146
// Floor-style integer division: for n < 0 evaluates -((-n + d - 1) / d), otherwise n / d.
// NOTE(review): the negative branch presumes d > 0 — confirm; floord is not called by the kernel below.
template<typename T> inline __device__ T floord(T n, T d) {
  return n < 0 ? - (-n + d - 1)/d : n / d;
}

// Halide type handling
// Short aliases for the built-in scalar types; the kernel below uses int32 and float32.
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;


extern "C" {
// avgpool_4_32_20_20 (tiled variant without barriers): adds the 2x2 input patch
// input[c4][c5][2*c6 .. 2*c6+1][2*c7 .. 2*c7+1] onto output[c4][c5][c6][c7].
// Dimension 2 is walked in tiles of 5 (c2) and dimension 3 in tiles of 2 (c3); each
// block covers the 16 dimension-1 indices starting at 16*b1.
// poutput is viewed as [][32][10][10] and pinput as [][32][20][20]. B, C, H, W and
// b0, b2, t1, t2 are not used. Only sums are formed; no division appears here.
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
  int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
  int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
  float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
  float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
  for (int c2 = 0; c2 <= 9; c2 += 5) {
    for (int c3 = 0; c3 <= 9; c3 += 2) {
      for (int c4 = 0; c4 <= 3; c4 += 1) {
        for (int c5 = 16 * b1; c5 <= 16 * b1 + 15; c5 += 1) {
          for (int c6 = c2; c6 <= c2 + 4; c6 += 1) {
            for (int c7 = c3; c7 <= c3 + 1; c7 += 1) {
              for (int c8 = 0; c8 <= 3; c8 += 1) {
                // Thread t0 handles patch row t0; c8 - 2*t0 (0 or 1) is the patch column,
                // so only t0 == 0 and t0 == 1 ever pass this guard.
                // NOTE(review): both of those threads read-modify-write the same output
                // element and no barrier separates them in this variant — confirm the
                // launch configuration makes this safe.
                if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
                  output[c4][c5][c6][c7] = (output[c4][c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]);
                }
              }
            }
          }
        }
      }
    }
  }
}
}

// Floor-style integer division: for n < 0 evaluates -((-n + d - 1) / d), otherwise n / d.
// NOTE(review): the negative branch presumes d > 0 — confirm; floord is not called by the kernel below.
template<typename T> inline __device__ T floord(T n, T d) {
  return n < 0 ? - (-n + d - 1)/d : n / d;
}

// Halide type handling
// Short aliases for the built-in scalar types; the kernel below uses int32 and float32.
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;


extern "C" {
// avgpool_4_32_20_20 (block-mapped variant without barriers): adds each 2x2 input
// patch input[..][..][2*r .. 2*r+1][2*c .. 2*c+1] onto the matching output element.
// Each block covers dimension-0 indices from 3*b0, dimension-1 indices from 20*b1 and
// the 5 dimension-2 indices from 5*b2; dimension 3 is walked in chunks starting at 0
// and 8 (c3).
// poutput is viewed as [][32][10][10] and pinput as [][32][20][20]. B, C, H, W and
// t1, t2 are not used. Only sums are formed; no division appears in this kernel.
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
  int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
  int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
  float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
  float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
  for (int c3 = 0; c3 <= 9; c3 += 8) {
    // min(2, -3 * b0 + 3) keeps 3*b0 + c4 at or below 3.
    for (int c4 = 0; c4 <= min(2, -3 * b0 + 3); c4 += 1) {
      // min(19, -20 * b1 + 31) keeps 20*b1 + c5 at or below 31.
      for (int c5 = 0; c5 <= min(19, -20 * b1 + 31); c5 += 1) {
        for (int c6 = 0; c6 <= 4; c6 += 1) {
          // min(7, -c3 + 9) keeps c3 + c7 at or below 9.
          for (int c7 = 0; c7 <= min(7, -c3 + 9); c7 += 1) {
            for (int c8 = 0; c8 <= 3; c8 += 1) {
              // Thread t0 handles patch row t0; c8 - 2*t0 (0 or 1) is the patch column,
              // so only t0 == 0 and t0 == 1 ever pass this guard.
              // NOTE(review): both of those threads read-modify-write the same output
              // element and no barrier separates them in this variant — confirm the
              // launch configuration makes this safe.
              if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
                output[3*b0 + c4][20*b1 + c5][5*b2 + c6][c3 + c7] = (output[3*b0 + c4][20*b1 + c5][5*b2 + c6][c3 + c7] + input[3*b0 + c4][20*b1 + c5][t0 + 10*b2 + 2*c6][-2*t0 + 2*c3 + 2*c7 + c8]);
              }
            }
          }
        }
      }
    }
  }
}
}

Generation 0	Job[Compiled, GPU] (9, 8)/10   Time (us): best: 3472 median: 7900 worst: 13146
// Floor-style integer division: for n < 0 evaluates -((-n + d - 1) / d), otherwise n / d.
// NOTE(review): the negative branch presumes d > 0 — confirm; floord is not called by the kernel below.
template<typename T> inline __device__ T floord(T n, T d) {
  return n < 0 ? - (-n + d - 1)/d : n / d;
}

// Halide type handling
// Short aliases for the built-in scalar types; the kernel below uses int32 and float32.
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;


extern "C" {
// avgpool_4_32_20_20 (tiled variant): adds the 2x2 input patch
// input[c4][c5][2*c6 .. 2*c6+1][2*c7 .. 2*c7+1] onto output[c4][c5][c6][c7].
// Dimension 2 is walked in two tiles of 5 rows (c2 = 0 and c2 = 5).
// poutput is viewed as [][32][10][10] and pinput as [][32][20][20]. B, C, H, W and
// b0, b1, b2, t1, t2 are not used. Only sums are formed; no division appears here.
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
  int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
  int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
  float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
  float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
  for (int c2 = 0; c2 <= 9; c2 += 5) {
    for (int c4 = 0; c4 <= 3; c4 += 1) {
      for (int c5 = 0; c5 <= 31; c5 += 1) {
        for (int c6 = c2; c6 <= c2 + 4; c6 += 1) {
          for (int c7 = 0; c7 <= 9; c7 += 1) {
            for (int c8 = 0; c8 <= 3; c8 += 1) {
              // Thread t0 handles patch row t0; c8 - 2*t0 (0 or 1) is the patch column,
              // so only t0 == 0 and t0 == 1 ever pass this guard.
              if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
                output[c4][c5][c6][c7] = (output[c4][c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]);
              }
              // Barrier after every c8 step, outside the guard so all threads reach it;
              // within a block it separates the updates that different t0 values make
              // to the same output element.
              __syncthreads();
            }
          }
        }
      }
    }
  }
}
}

Generation 0	Job[Compiled, GPU] (10, 10)/10   Time (us): best: 2044 median: 7716 worst: 13146
Generation 0	Job[Compiled, GPU] (10, 10)/10   Time (us): best: 2044 median: 7716 worst: 13146

Generation 1	Job[Compiled, GPU] (3, 2)/10   Time (us): best: 2042 median: 3471 worst: 3471
// Floor-style integer division: for n < 0 evaluates -((-n + d - 1) / d), otherwise n / d.
// NOTE(review): the negative branch presumes d > 0 — confirm; floord is not called by the kernel below.
template<typename T> inline __device__ T floord(T n, T d) {
  return n < 0 ? - (-n + d - 1)/d : n / d;
}

// Halide type handling
// Short aliases for the built-in scalar types; the kernel below uses int32 and float32.
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;


extern "C" {
// avgpool_4_32_20_20 (tiled variant): adds the 2x2 input patch
// input[c4][c5][2*c6 .. 2*c6+1][2*c7 .. 2*c7+1] onto output[c4][c5][c6][c7].
// Dimension 2 is walked in tiles of 5 (c2) and dimension 3 in tiles of 2 (c3); each
// block covers the 16 dimension-1 indices starting at 16*b1.
// poutput is viewed as [][32][10][10] and pinput as [][32][20][20]. B, C, H, W and
// b0, b2, t1, t2 are not used. Only sums are formed; no division appears here.
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
  int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
  int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
  float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
  float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
  for (int c2 = 0; c2 <= 9; c2 += 5) {
    for (int c3 = 0; c3 <= 9; c3 += 2) {
      for (int c4 = 0; c4 <= 3; c4 += 1) {
        for (int c5 = 16 * b1; c5 <= 16 * b1 + 15; c5 += 1) {
          for (int c6 = c2; c6 <= c2 + 4; c6 += 1) {
            for (int c7 = c3; c7 <= c3 + 1; c7 += 1) {
              for (int c8 = 0; c8 <= 3; c8 += 1) {
                // Thread t0 handles patch row t0; c8 - 2*t0 (0 or 1) is the patch column,
                // so only t0 == 0 and t0 == 1 ever pass this guard.
                if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
                  output[c4][c5][c6][c7] = (output[c4][c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]);
                }
                // Barrier outside the guard; the loop bounds depend only on b1 and loop
                // counters, so every thread of a block reaches it the same number of times.
                __syncthreads();
              }
            }
          }
        }
      }
    }
  }
}
}

Generation 1	Job[Compiled, GPU] (5, 4)/10   Time (us): best: 2042 median: 3471 worst: 6026
// Floor-style integer division: for n < 0 evaluates -((-n + d - 1) / d), otherwise n / d.
// NOTE(review): the negative branch presumes d > 0 — confirm; floord is not called by the kernel below.
template<typename T> inline __device__ T floord(T n, T d) {
  return n < 0 ? - (-n + d - 1)/d : n / d;
}

// Halide type handling
// Short aliases for the built-in scalar types; the kernel below uses int32 and float32.
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;


extern "C" {
// avgpool_4_32_20_20 (variant without barriers): for every output element
// (c4, c5, c6, c7) adds the 2x2 input patch
// input[c4][c5][2*c6 .. 2*c6+1][2*c7 .. 2*c7+1] onto output[c4][c5][c6][c7].
// poutput is viewed as [][32][10][10] and pinput as [][32][20][20]. B, C, H, W and the
// locals b0, b1, b2, t1, t2 are not used. Only sums are formed here; no division by
// the patch size appears in this kernel.
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
  int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
  int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
  float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
  float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
  for (int c4 = 0; c4 <= 3; c4 += 1) {
    for (int c5 = 0; c5 <= 31; c5 += 1) {
      for (int c6 = 0; c6 <= 9; c6 += 1) {
        for (int c7 = 0; c7 <= 9; c7 += 1) {
          for (int c8 = 0; c8 <= 3; c8 += 1) {
            // Thread t0 handles patch row t0; c8 - 2*t0 (0 or 1) is the patch column,
            // so only t0 == 0 and t0 == 1 ever pass this guard.
            // NOTE(review): both of those threads read-modify-write the same output
            // element and no barrier separates them in this variant — confirm the
            // launch configuration makes this safe.
            if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
              output[c4][c5][c6][c7] = (output[c4][c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]);
            }
          }
        }
      }
    }
  }
}
}

Generation 1	Job[Compiled, GPU] (7, 6)/10   Time (us): best: 2042 median: 6026 worst: 7900
// Floor-style integer division: for n < 0 evaluates -((-n + d - 1) / d), otherwise n / d.
// NOTE(review): the negative branch presumes d > 0 — confirm; floord is not called by the kernel below.
template<typename T> inline __device__ T floord(T n, T d) {
  return n < 0 ? - (-n + d - 1)/d : n / d;
}

// Halide type handling
// Short aliases for the built-in scalar types; the kernel below uses int32 and float32.
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;


extern "C" {
// avgpool_4_32_20_20: for every output element (c4, c5, c6, c7) adds the 2x2 input
// patch input[c4][c5][2*c6 .. 2*c6+1][2*c7 .. 2*c7+1] onto output[c4][c5][c6][c7].
// poutput is viewed as [][32][10][10] and pinput as [][32][20][20]. B, C, H, W and the
// locals b0, b1, b2, t1, t2 are not used. Only sums are formed here; no division by
// the patch size appears in this kernel.
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
  int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
  int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
  float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
  float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
  for (int c4 = 0; c4 <= 3; c4 += 1) {
    for (int c5 = 0; c5 <= 31; c5 += 1) {
      for (int c6 = 0; c6 <= 9; c6 += 1) {
        for (int c7 = 0; c7 <= 9; c7 += 1) {
          for (int c8 = 0; c8 <= 3; c8 += 1) {
            // Thread t0 handles patch row t0; c8 - 2*t0 (0 or 1) is the patch column,
            // so only t0 == 0 and t0 == 1 ever pass this guard.
            if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
              output[c4][c5][c6][c7] = (output[c4][c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]);
            }
            // Barrier after every c8 step, outside the guard so all threads reach it;
            // within a block it separates the updates that different t0 values make
            // to the same output element.
            __syncthreads();
          }
        }
      }
    }
  }
}
}

// Floor-style integer division: for n < 0 evaluates -((-n + d - 1) / d), otherwise n / d.
// NOTE(review): the negative branch presumes d > 0 — confirm; floord is not called by the kernel below.
template<typename T> inline __device__ T floord(T n, T d) {
  return n < 0 ? - (-n + d - 1)/d : n / d;
}

// Halide type handling
// Short aliases for the built-in scalar types; the kernel below uses int32 and float32.
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;


extern "C" {
// avgpool_4_32_20_20 (block-mapped variant): adds the 2x2 input patch
// input[c4][c1+c5][2*b2 .. 2*b2+1][2*c7 .. 2*c7+1] onto output[c4][c1+c5][b2][c7].
// Each block takes its dimension-2 index from b2 and walks dimension 1 in groups of 4
// starting at 4*b1 with a step of 8.
// poutput is viewed as [][32][10][10] and pinput as [][32][20][20]. B, C, H, W and
// b0, t1, t2 are not used. Only sums are formed; no division appears in this kernel.
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
  int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
  int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
  float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
  float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
  // NOTE(review): the step of 8 with a start of 4*b1 presumably pairs with two blocks
  // in y — confirm against the launch configuration.
  for (int c1 = 4 * b1; c1 <= 31; c1 += 8) {
    for (int c4 = 0; c4 <= 3; c4 += 1) {
      for (int c5 = 0; c5 <= 3; c5 += 1) {
        for (int c7 = 0; c7 <= 9; c7 += 1) {
          for (int c8 = 0; c8 <= 3; c8 += 1) {
            // Thread t0 handles patch row t0; c8 - 2*t0 (0 or 1) is the patch column,
            // so only t0 == 0 and t0 == 1 ever pass this guard.
            if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
              output[c4][c1 + c5][b2][c7] = (output[c4][c1 + c5][b2][c7] + input[c4][c1 + c5][t0 + 2*b2][-2*t0 + 2*c7 + c8]);
            }
            // Barrier outside the guard; the loop bounds depend only on b1 and loop
            // counters, so every thread of a block reaches it the same number of times.
            __syncthreads();
          }
        }
      }
    }
  }
}
}

Generation 1	Job[Compiled, GPU] (8, 7)/10   Time (us): best: 2042 median: 6026 worst: 7900
// Floor-style integer division: for n < 0 evaluates -((-n + d - 1) / d), otherwise n / d.
// NOTE(review): the negative branch presumes d > 0 — confirm; floord is not called by the kernel below.
template<typename T> inline __device__ T floord(T n, T d) {
  return n < 0 ? - (-n + d - 1)/d : n / d;
}

// Halide type handling
// Short aliases for the built-in scalar types; the kernel below uses int32 and float32.
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;


extern "C" {
// avgpool_4_32_20_20 (block-mapped variant): adds the 2x2 input patch
// input[c4][3*b1+c5][2*c6 .. 2*c6+1][2*c7 .. 2*c7+1] onto output[c4][3*b1+c5][c6][c7].
// Each block covers up to 3 consecutive dimension-1 indices starting at 3*b1.
// poutput is viewed as [][32][10][10] and pinput as [][32][20][20]. B, C, H, W and
// b0, b2, t1, t2 are not used. Only sums are formed; no division appears here.
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
  int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
  int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
  float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
  float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
  for (int c4 = 0; c4 <= 3; c4 += 1) {
    // min(2, -3 * b1 + 31) keeps 3*b1 + c5 at or below 31.
    for (int c5 = 0; c5 <= min(2, -3 * b1 + 31); c5 += 1) {
      for (int c6 = 0; c6 <= 9; c6 += 1) {
        for (int c7 = 0; c7 <= 9; c7 += 1) {
          for (int c8 = 0; c8 <= 3; c8 += 1) {
            // Thread t0 handles patch row t0; c8 - 2*t0 (0 or 1) is the patch column,
            // so only t0 == 0 and t0 == 1 ever pass this guard.
            if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
              output[c4][3*b1 + c5][c6][c7] = (output[c4][3*b1 + c5][c6][c7] + input[c4][3*b1 + c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]);
            }
            // Barrier outside the guard; the loop bounds depend only on b1 and loop
            // counters, so every thread of a block reaches it the same number of times.
            __syncthreads();
          }
        }
      }
    }
  }
}
}

Generation 1	Job[Compiled, GPU] (10, 9)/10   Time (us): best: 606 median: 5872 worst: 7900
// Floor-style integer division: for n < 0 evaluates -((-n + d - 1) / d), otherwise n / d.
// NOTE(review): the negative branch presumes d > 0 — confirm; floord is not called by the kernel below.
template<typename T> inline __device__ T floord(T n, T d) {
  return n < 0 ? - (-n + d - 1)/d : n / d;
}

// Halide type handling
// Short aliases for the built-in scalar types; the kernel below uses int32 and float32.
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;


extern "C" {
// avgpool_4_32_20_20 (shared-memory, block-mapped variant): each block covers
// dimension-1 indices from 20*b1 (up to 20 of them) and the 5 dimension-2 indices from
// 5*b2. It copies that slice of output into a __shared__ tile, adds each 2x2 input
// patch onto the matching tile element, then writes the tile back to output.
// poutput is viewed as [][32][10][10] and pinput as [][32][20][20]. B, C, H, W and
// b0, t1, t2 are not used. Only sums are formed; no division appears in this kernel.
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
  int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
  int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
  float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
  float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
  // Staging tile for this block's slice of output; last dimension declared 11 wide.
  __shared__ float32 _output_0[4][20][5][11];
  __syncthreads();
  // Copy-in: each thread stages column t0 of every row in the slice.
  // NOTE(review): indexing output directly with t0 presumes t0 stays within 0..9 —
  // confirm against the launch configuration.
  for (int c3 = 0; c3 <= 3; c3 += 1) {
    // min(19, -20 * b1 + 31) keeps 20*b1 + c4 at or below 31.
    for (int c4 = 0; c4 <= min(19, -20 * b1 + 31); c4 += 1) {
      for (int c5 = 0; c5 <= 4; c5 += 1) {
        _output_0[c3][c4][c5][t0] = output[c3][20*b1 + c4][5*b2 + c5][t0];
      }
    }
  }
  __syncthreads();
  // Accumulation: dimension 3 is walked in chunks starting at 0 and 8 (c3).
  for (int c3 = 0; c3 <= 9; c3 += 8) {
    for (int c4 = 0; c4 <= 3; c4 += 1) {
      for (int c5 = 0; c5 <= min(19, -20 * b1 + 31); c5 += 1) {
        for (int c6 = 0; c6 <= 4; c6 += 1) {
          for (int c7 = 0; c7 <= min(7, -c3 + 9); c7 += 1) {
            for (int c8 = 0; c8 <= 3; c8 += 1) {
              // Thread t0 handles patch row t0; c8 - 2*t0 (0 or 1) is the patch column,
              // so only t0 == 0 and t0 == 1 ever pass this guard.
              // NOTE(review): both of those threads read-modify-write the same tile
              // element and no barrier separates them inside this loop nest — confirm
              // this is safe for the launch configuration used.
              if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
                _output_0[c4][c5][c6][c3 + c7] = (_output_0[c4][c5][c6][c3 + c7] + input[c4][20*b1 + c5][t0 + 10*b2 + 2*c6][-2*t0 + 2*c3 + 2*c7 + c8]);
              }
            }
          }
        }
      }
    }
  }
  __syncthreads();
  // Copy-out: mirror of the copy-in, writing the accumulated tile back to output.
  for (int c3 = 0; c3 <= 3; c3 += 1) {
    for (int c4 = 0; c4 <= min(19, -20 * b1 + 31); c4 += 1) {
      for (int c5 = 0; c5 <= 4; c5 += 1) {
        output[c3][20*b1 + c4][5*b2 + c5][t0] = _output_0[c3][c4][c5][t0];
      }
    }
  }
  __syncthreads();
}
}

Generation 1	Job[Compiled, GPU] (10, 9)/10   Time (us): best: 606 median: 5872 worst: 7900
Generation 1	Job[Compiled, GPU] (10, 10)/10   Time (us): best: 606 median: 3471 worst: 7900
Generation 1	Job[Compiled, GPU] (10, 10)/10   Time (us): best: 606 median: 3471 worst: 7900

Generation 2	Job[Compiled, GPU] (2, 1)/10   Time (us): best: 604 median: 604 worst: 604
// Floor-style integer division: for n < 0 evaluates -((-n + d - 1) / d), otherwise n / d.
// NOTE(review): the negative branch presumes d > 0 — confirm; floord is not called by the kernel below.
template<typename T> inline __device__ T floord(T n, T d) {
  return n < 0 ? - (-n + d - 1)/d : n / d;
}

// Halide type handling
// Short aliases for the built-in scalar types; the kernel below uses int32 and float32.
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;


extern "C" {
// avgpool_4_32_20_20 (shared-memory, block-mapped variant): each block covers
// dimension-1 indices from 20*b1 (up to 20 of them) and the 5 dimension-2 indices from
// 5*b2. It copies that slice of output into a __shared__ tile, adds each 2x2 input
// patch onto the matching tile element, then writes the tile back to output.
// poutput is viewed as [][32][10][10] and pinput as [][32][20][20]. B, C, H, W and
// b0, t1, t2 are not used. Only sums are formed; no division appears in this kernel.
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
  int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
  int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
  float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
  float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
  // Staging tile for this block's slice of output; last dimension declared 11 wide.
  __shared__ float32 _output_0[4][20][5][11];
  __syncthreads();
  // Copy-in: each thread stages column t0 of every row in the slice.
  // NOTE(review): indexing output directly with t0 presumes t0 stays within 0..9 —
  // confirm against the launch configuration.
  for (int c3 = 0; c3 <= 3; c3 += 1) {
    // min(19, -20 * b1 + 31) keeps 20*b1 + c4 at or below 31.
    for (int c4 = 0; c4 <= min(19, -20 * b1 + 31); c4 += 1) {
      for (int c5 = 0; c5 <= 4; c5 += 1) {
        _output_0[c3][c4][c5][t0] = output[c3][20*b1 + c4][5*b2 + c5][t0];
      }
    }
  }
  __syncthreads();
  // Accumulation: c5 and c6 run over global dimension-1 / dimension-2 indices; the
  // tile is addressed by subtracting the block offsets 20*b1 and 5*b2.
  for (int c4 = 0; c4 <= 3; c4 += 1) {
    for (int c5 = 20 * b1; c5 <= min(31, 20 * b1 + 19); c5 += 1) {
      for (int c6 = 5 * b2; c6 <= 5 * b2 + 4; c6 += 1) {
        for (int c7 = 0; c7 <= 9; c7 += 1) {
          for (int c8 = 0; c8 <= 3; c8 += 1) {
            // Thread t0 handles patch row t0; c8 - 2*t0 (0 or 1) is the patch column,
            // so only t0 == 0 and t0 == 1 ever pass this guard.
            // NOTE(review): both of those threads read-modify-write the same tile
            // element and no barrier separates them inside this loop nest — confirm
            // this is safe for the launch configuration used.
            if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
              _output_0[c4][-20*b1 + c5][-5*b2 + c6][c7] = (_output_0[c4][-20*b1 + c5][-5*b2 + c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]);
            }
          }
        }
      }
    }
  }
  __syncthreads();
  // Copy-out: mirror of the copy-in, writing the accumulated tile back to output.
  for (int c3 = 0; c3 <= 3; c3 += 1) {
    for (int c4 = 0; c4 <= min(19, -20 * b1 + 31); c4 += 1) {
      for (int c5 = 0; c5 <= 4; c5 += 1) {
        output[c3][20*b1 + c4][5*b2 + c5][t0] = _output_0[c3][c4][c5][t0];
      }
    }
  }
  __syncthreads();
}
}

Generation 2	Job[Compiled, GPU] (3, 2)/10   Time (us): best: 604 median: 604 worst: 604
// Floor-style integer division: for n < 0 evaluates -((-n + d - 1) / d), otherwise n / d.
// NOTE(review): the negative branch presumes d > 0 — confirm; floord is not called by the kernel below.
template<typename T> inline __device__ T floord(T n, T d) {
  return n < 0 ? - (-n + d - 1)/d : n / d;
}

// Halide type handling
// Short aliases for the built-in scalar types; the kernel below uses int32 and float32.
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;


extern "C" {
// avgpool_4_32_20_20 (block-mapped variant): adds the 2x2 input patch
// input[c4][3*b1+c5][2*c6 .. 2*c6+1][2*c7 .. 2*c7+1] onto output[c4][3*b1+c5][c6][c7].
// Each block covers up to 3 consecutive dimension-1 indices starting at 3*b1.
// poutput is viewed as [][32][10][10] and pinput as [][32][20][20]. B, C, H, W and
// b0, b2, t1, t2 are not used. Only sums are formed; no division appears here.
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
  int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
  int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
  float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
  float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
  for (int c4 = 0; c4 <= 3; c4 += 1) {
    // min(2, -3 * b1 + 31) keeps 3*b1 + c5 at or below 31.
    for (int c5 = 0; c5 <= min(2, -3 * b1 + 31); c5 += 1) {
      for (int c6 = 0; c6 <= 9; c6 += 1) {
        for (int c7 = 0; c7 <= 9; c7 += 1) {
          for (int c8 = 0; c8 <= 3; c8 += 1) {
            // Thread t0 handles patch row t0; c8 - 2*t0 (0 or 1) is the patch column,
            // so only t0 == 0 and t0 == 1 ever pass this guard.
            if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
              output[c4][3*b1 + c5][c6][c7] = (output[c4][3*b1 + c5][c6][c7] + input[c4][3*b1 + c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]);
            }
            // Barrier outside the guard; the loop bounds depend only on b1 and loop
            // counters, so every thread of a block reaches it the same number of times.
            __syncthreads();
          }
        }
      }
    }
  }
}
}

Generation 2	Job[Compiled, GPU] (4, 3)/10   Time (us): best: 604 median: 704 worst: 1052
Generation 2	Job[Compiled, GPU] (4, 3)/10   Time (us): best: 604 median: 704 worst: 1052
Generation 2	Job[Compiled, GPU] (4, 3)/10   Time (us): best: 604 median: 704 worst: 1052
Generation 2	Job[Compiled, GPU] (4, 3)/10   Time (us): best: 604 median: 704 worst: 1052
// Integer division that rounds toward negative infinity. Plain `/` truncates
// toward zero, which is off by one for negative numerators. The rounding
// adjustment assumes a positive divisor d.
template<typename T> inline __device__ T floord(T n, T d) {
  if (n < 0) {
    // -ceil(-n / d), which equals floor(n / d) when d > 0.
    return - (-n + d - 1)/d;
  }
  return n / d;
}

// Fixed-name scalar aliases (Halide-style type names) used by the kernel
// signature and its array views.
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;


extern "C" {
// Generated 2x2 sum-pooling kernel that accumulates its output tile in shared
// memory. `pinput` is viewed as [..][32][20][20] and `poutput` as
// [..][32][10][10]; output[n][c][h][w] accumulates (on top of its current
// value) the inputs at rows 2h..2h+1 and columns 2w..2w+1. B, C, H and W are
// accepted but never read.
// Tiling: blockIdx.x -> 3 batch entries (clipped at 3), blockIdx.y -> 20
// channels (clipped at 31), blockIdx.z -> 5 output rows. threadIdx.x is the
// output column during the copies and the window row during the reduction.
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
  int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
  int t0 = threadIdx.x;
  float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
  float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
  __shared__ float32 _output_0[3][20][5][11];
  // Last valid offsets inside this block's batch and channel tiles.
  const int lastBatch = min(2, -3 * b0 + 3);
  const int lastChan = min(19, -20 * b1 + 31);
  __syncthreads();
  // Copy in: each thread loads column t0 of every row of the output tile.
  for (int c3 = 0; c3 <= lastBatch; c3 += 1) {
    for (int c4 = 0; c4 <= lastChan; c4 += 1) {
      for (int c5 = 0; c5 <= 4; c5 += 1) {
        _output_0[c3][c4][c5][t0] = output[3*b0 + c3][20*b1 + c4][5*b2 + c5][t0];
      }
    }
  }
  __syncthreads();
  // Reduction: c8 enumerates the four window elements; thread t0 is active on
  // steps 2*t0 and 2*t0 + 1, so threads 0 and 1 update the same accumulator
  // on different steps.
  for (int c3 = 0; c3 <= 9; c3 += 8) {
    for (int c4 = 0; c4 <= lastBatch; c4 += 1) {
      for (int c5 = 0; c5 <= lastChan; c5 += 1) {
        for (int c6 = 0; c6 <= 4; c6 += 1) {
          for (int c7 = 0; c7 <= min(7, -c3 + 9); c7 += 1) {
            for (int c8 = 0; c8 <= 3; c8 += 1) {
              if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
                _output_0[c4][c5][c6][c3 + c7] = (_output_0[c4][c5][c6][c3 + c7] + input[3*b0 + c4][20*b1 + c5][t0 + 10*b2 + 2*c6][-2*t0 + 2*c3 + 2*c7 + c8]);
              }
              // Without a barrier here the read-modify-write of one thread
              // can overlap the other's and lose an update. All enclosing
              // loop bounds are block-uniform, so every thread reaches it.
              __syncthreads();
            }
          }
        }
      }
    }
  }
  __syncthreads();
  // Copy out: mirror of the copy in.
  for (int c3 = 0; c3 <= lastBatch; c3 += 1) {
    for (int c4 = 0; c4 <= lastChan; c4 += 1) {
      for (int c5 = 0; c5 <= 4; c5 += 1) {
        output[3*b0 + c3][20*b1 + c4][5*b2 + c5][t0] = _output_0[c3][c4][c5][t0];
      }
    }
  }
  __syncthreads();
}
}

Generation 2	Job[Compiled, GPU] (4, 3)/10   Time (us): best: 604 median: 704 worst: 1052
Generation 2	Job[Compiled, GPU] (5, 4)/10   Time (us): best: 604 median: 704 worst: 1052
// Integer division that rounds toward negative infinity. Plain `/` truncates
// toward zero, which is off by one for negative numerators. The rounding
// adjustment assumes a positive divisor d.
template<typename T> inline __device__ T floord(T n, T d) {
  if (n < 0) {
    // -ceil(-n / d), which equals floor(n / d) when d > 0.
    return - (-n + d - 1)/d;
  }
  return n / d;
}

// Fixed-name scalar aliases (Halide-style type names) used by the kernel
// signature and its array views.
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;


extern "C" {
// Generated 2x2 sum-pooling kernel that accumulates in shared memory.
// `pinput` is viewed as [..][32][20][20] and `poutput` as [..][32][10][10];
// output[n][c][h][w] accumulates (on top of its current value) the inputs at
// rows 2h..2h+1 and columns 2w..2w+1. B, C, H and W are accepted but never
// read. blockIdx.x selects a pair of batch entries; (threadIdx.y,
// threadIdx.x) is the (row, column) moved by a thread during the copies, and
// only threads with threadIdx.y == 0 take part in the reduction.
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
  const int batchTile = blockIdx.x;
  const int tx = threadIdx.x;
  const int ty = threadIdx.y;
  float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
  float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
  __shared__ float32 outTile[2][32][10][11];
  __syncthreads();
  // Copy in the current output values for both batch entries, all channels.
  for (int dn = 0; dn < 2; ++dn) {
    for (int c = 0; c < 32; ++c) {
      outTile[dn][c][ty][tx] = output[2*batchTile + dn][c][ty][tx];
    }
  }
  __syncthreads();
  // A thread is active on reduction steps 2*tx and 2*tx + 1 only.
  const int firstStep = 2 * tx;
  for (int dn = 0; dn < 2; ++dn) {
    for (int c = 0; c < 32; ++c) {
      for (int h = 0; h < 10; ++h) {
        for (int w = 0; w < 10; ++w) {
          for (int k = 0; k < 4; ++k) {
            if (ty == 0 && firstStep <= k && k <= firstStep + 1) {
              outTile[dn][c][h][w] += input[2*batchTile + dn][c][tx + 2*h][2*w + k - firstStep];
            }
            // Reached by every thread on every step: orders the updates that
            // different threads make to the same accumulator.
            __syncthreads();
          }
        }
      }
    }
  }
  __syncthreads();
  // Copy the accumulated tile back to global memory.
  for (int dn = 0; dn < 2; ++dn) {
    for (int c = 0; c < 32; ++c) {
      output[2*batchTile + dn][c][ty][tx] = outTile[dn][c][ty][tx];
    }
  }
  __syncthreads();
}
}

Generation 2	Job[Compiled, GPU] (6, 5)/10   Time (us): best: 604 median: 704 worst: 1052
// Integer division that rounds toward negative infinity. Plain `/` truncates
// toward zero, which is off by one for negative numerators. The rounding
// adjustment assumes a positive divisor d.
template<typename T> inline __device__ T floord(T n, T d) {
  if (n < 0) {
    // -ceil(-n / d), which equals floor(n / d) when d > 0.
    return - (-n + d - 1)/d;
  }
  return n / d;
}

// Fixed-name scalar aliases (Halide-style type names) used by the kernel
// signature and its array views.
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;


extern "C" {
// Generated 2x2 sum-pooling kernel that accumulates in shared memory.
// `pinput` is viewed as [..][32][20][20] and `poutput` as [..][32][10][10];
// output[n][c][h][w] accumulates (on top of its current value) the inputs at
// rows 2h..2h+1 and columns 2w..2w+1. B, C, H and W are accepted but never
// read. blockIdx.x selects a pair of batch entries and blockIdx.y a group of
// 16 channels. During the copies threadIdx.x is the column and threadIdx.y
// strides over rows in steps of 3; only threads with threadIdx.y == 0 take
// part in the reduction.
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
  const int batchBase = 2 * blockIdx.x;
  const int chanBase = 16 * blockIdx.y;
  const int tx = threadIdx.x;
  const int ty = threadIdx.y;
  float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
  float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
  __shared__ float32 outTile[2][16][10][11];
  __syncthreads();
  // Copy in the current output values of this block's tile.
  for (int dn = 0; dn < 2; ++dn) {
    for (int dc = 0; dc < 16; ++dc) {
      for (int h = ty; h < 10; h += 3) {
        outTile[dn][dc][h][tx] = output[batchBase + dn][chanBase + dc][h][tx];
      }
    }
  }
  __syncthreads();
  // A thread is active on reduction steps 2*tx and 2*tx + 1 only.
  const int firstStep = 2 * tx;
  // Output rows are visited in groups of four (0-3, 4-7, 8-9).
  for (int rowBase = 0; rowBase < 10; rowBase += 4) {
    const int lastRow = min(9, rowBase + 3);
    for (int dn = 0; dn < 2; ++dn) {
      for (int dc = 0; dc < 16; ++dc) {
        for (int h = rowBase; h <= lastRow; ++h) {
          for (int w = 0; w < 10; ++w) {
            for (int k = 0; k < 4; ++k) {
              if (ty == 0 && firstStep <= k && k <= firstStep + 1) {
                outTile[dn][dc][h][w] += input[batchBase + dn][chanBase + dc][tx + 2*h][2*w + k - firstStep];
              }
              // Reached by every thread on every step: orders the updates
              // that different threads make to the same accumulator.
              __syncthreads();
            }
          }
        }
      }
    }
  }
  __syncthreads();
  // Copy the accumulated tile back to global memory.
  for (int dn = 0; dn < 2; ++dn) {
    for (int dc = 0; dc < 16; ++dc) {
      for (int h = ty; h < 10; h += 3) {
        output[batchBase + dn][chanBase + dc][h][tx] = outTile[dn][dc][h][tx];
      }
    }
  }
  __syncthreads();
}
}

Generation 2	Job[Compiled, GPU] (6, 5)/10   Time (us): best: 604 median: 704 worst: 1052
Generation 2	Job[Compiled, GPU] (8, 7)/10   Time (us): best: 604 median: 704 worst: 1769
Generation 2	Job[Compiled, GPU] (8, 7)/10   Time (us): best: 604 median: 704 worst: 1769
// Integer division that rounds toward negative infinity. Plain `/` truncates
// toward zero, which is off by one for negative numerators. The rounding
// adjustment assumes a positive divisor d.
template<typename T> inline __device__ T floord(T n, T d) {
  if (n < 0) {
    // -ceil(-n / d), which equals floor(n / d) when d > 0.
    return - (-n + d - 1)/d;
  }
  return n / d;
}

// Fixed-name scalar aliases (Halide-style type names) used by the kernel
// signature and its array views.
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;


extern "C" {
// Generated 2x2 sum-pooling kernel that accumulates in shared memory, one
// channel group at a time. `pinput` is viewed as [..][32][20][20] and
// `poutput` as [..][32][10][10]; output[n][c][h][w] accumulates (on top of
// its current value) the inputs at rows 2h..2h+1 and columns 2w..2w+1.
// B, C, H and W are accepted but never read. blockIdx.x selects three batch
// entries (clipped at 3). The copies stride channels by threadIdx.z (step 5),
// rows by threadIdx.y (step 5) and columns by threadIdx.x (step 4); only
// threads with threadIdx.y == 0 and threadIdx.z == 0 take part in the
// reduction.
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
  const int batchTile = blockIdx.x;
  const int tx = threadIdx.x;
  const int ty = threadIdx.y;
  const int tz = threadIdx.z;
  float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
  float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
  __shared__ float32 outTile[3][20][10][11];
  // Last valid batch offset inside this block's tile.
  const int lastBatch = min(2, 3 - 3 * batchTile);
  // A thread is active on reduction steps 2*tx and 2*tx + 1 only.
  const int firstStep = 2 * tx;
  // Channels are processed in groups of 20 (0-19, then 20-31).
  for (int chanBase = 0; chanBase < 32; chanBase += 20) {
    const int lastChan = min(19, 31 - chanBase);
    __syncthreads();
    // Copy in the current output values of this channel group.
    for (int dn = 0; dn <= lastBatch; ++dn) {
      for (int dc = tz; dc <= lastChan; dc += 5) {
        for (int h = ty; h < 10; h += 5) {
          for (int w = tx; w < 10; w += 4) {
            outTile[dn][dc][h][w] = output[3*batchTile + dn][chanBase + dc][h][w];
          }
        }
      }
    }
    __syncthreads();
    // Output columns are visited in groups of eight (0-7, then 8-9).
    for (int colBase = 0; colBase < 10; colBase += 8) {
      const int lastCol = min(7, 9 - colBase);
      for (int dn = 0; dn <= lastBatch; ++dn) {
        for (int dc = 0; dc <= lastChan; ++dc) {
          for (int h = 0; h < 10; ++h) {
            for (int dw = 0; dw <= lastCol; ++dw) {
              for (int k = 0; k < 4; ++k) {
                if (ty == 0 && tz == 0 && firstStep <= k && k <= firstStep + 1) {
                  outTile[dn][dc][h][colBase + dw] += input[3*batchTile + dn][chanBase + dc][tx + 2*h][2*(colBase + dw) + k - firstStep];
                }
                // Reached by every thread on every step: orders the updates
                // that different threads make to the same accumulator.
                __syncthreads();
              }
            }
          }
        }
      }
    }
    __syncthreads();
    // Copy the accumulated channel group back to global memory.
    for (int dn = 0; dn <= lastBatch; ++dn) {
      for (int dc = tz; dc <= lastChan; dc += 5) {
        for (int h = ty; h < 10; h += 5) {
          for (int w = tx; w < 10; w += 4) {
            output[3*batchTile + dn][chanBase + dc][h][w] = outTile[dn][dc][h][w];
          }
        }
      }
    }
    __syncthreads();
  }
}
}

Generation 2	Job[Compiled, GPU] (10, 9)/10   Time (us): best: 604 median: 704 worst: 1769
// Integer division that rounds toward negative infinity. Plain `/` truncates
// toward zero, which is off by one for negative numerators. The rounding
// adjustment assumes a positive divisor d.
template<typename T> inline __device__ T floord(T n, T d) {
  if (n < 0) {
    // -ceil(-n / d), which equals floor(n / d) when d > 0.
    return - (-n + d - 1)/d;
  }
  return n / d;
}

// Fixed-name scalar aliases (Halide-style type names) used by the kernel
// signature and its array views.
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;


extern "C" {
// Generated 2x2 sum-pooling kernel working directly on global memory.
// `pinput` is viewed as [..][32][20][20] and `poutput` as [..][32][10][10];
// output[n][c][h][w] accumulates (on top of its current value) the inputs at
// rows 2h..2h+1 and columns 2w..2w+1. B, C, H and W are accepted but never
// read. blockIdx.y selects three channels (clipped at 31), blockIdx.z selects
// five output rows, and threadIdx.x selects which window row a thread
// contributes.
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
  const int chanBase = 3 * blockIdx.y;
  const int rowBase = 5 * blockIdx.z;
  const int tx = threadIdx.x;
  float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
  float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
  const int lastChan = min(31, chanBase + 2);
  // A thread is active on reduction steps 2*tx and 2*tx + 1 only.
  const int firstStep = 2 * tx;
  // Output columns are visited in groups of eight (0-7, then 8-9).
  for (int colBase = 0; colBase < 10; colBase += 8) {
    const int lastCol = min(9, colBase + 7);
    for (int n = 0; n < 4; ++n) {
      for (int c = chanBase; c <= lastChan; ++c) {
        for (int h = rowBase; h < rowBase + 5; ++h) {
          for (int w = colBase; w <= lastCol; ++w) {
            for (int k = 0; k < 4; ++k) {
              if (firstStep <= k && k <= firstStep + 1) {
                output[n][c][h][w] += input[n][c][tx + 2*h][2*w + k - firstStep];
              }
              // Reached by every thread on every step: orders the updates
              // that different threads make to the same output element.
              __syncthreads();
            }
          }
        }
      }
    }
  }
}
}

Generation 2	Job[Compiled, GPU] (10, 9)/10   Time (us): best: 604 median: 704 worst: 1769
Generation 2	Job[Compiled, GPU] (10, 10)/10   Time (us): best: 559 median: 704 worst: 1769
Generation 2	Job[Compiled, GPU] (10, 10)/10   Time (us): best: 559 median: 704 worst: 1769

Generation 3	Job[Compiled, GPU] (3, 2)/10   Time (us): best: 558 median: 604 worst: 604
Generation 3	Job[Compiled, GPU] (4, 3)/10   Time (us): best: 558 median: 604 worst: 640
// Integer division that rounds toward negative infinity. Plain `/` truncates
// toward zero, which is off by one for negative numerators. The rounding
// adjustment assumes a positive divisor d.
template<typename T> inline __device__ T floord(T n, T d) {
  if (n < 0) {
    // -ceil(-n / d), which equals floor(n / d) when d > 0.
    return - (-n + d - 1)/d;
  }
  return n / d;
}

// Fixed-name scalar aliases (Halide-style type names) used by the kernel
// signature and its array views.
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;


extern "C" {
// Generated 2x2 sum-pooling kernel that stages both the input tile and the
// output tile in shared memory. `pinput` is viewed as [..][32][20][20] and
// `poutput` as [..][32][10][10]; output[n][c][h][w] accumulates (on top of
// its current value) the inputs at rows 2h..2h+1 and columns 2w..2w+1.
// B, C, H and W are accepted but never read.
// Tiling: blockIdx.y -> 5 channels (clipped at 31), blockIdx.z -> 5 output
// rows (10 input rows). threadIdx.x is the column during the copies and the
// window row during the reduction.
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
  int b1 = blockIdx.y; int b2 = blockIdx.z;
  int t0 = threadIdx.x;
  float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
  float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
  __shared__ float32 _output_0[4][5][5][11];
  __shared__ float32 _input_0[4][5][10][21];
  // Last valid channel offset inside this block's tile.
  const int lastChan = min(4, -5 * b1 + 31);
  __syncthreads();
  // Copy in the input tile, one column per thread.
  // NOTE(review): unlike the output copies, this one is not guarded on t0;
  // it presumably relies on the launch using at most 20 threads in x
  // (input rows are 20 wide) -- confirm against the launch configuration.
  for (int c3 = 0; c3 <= 3; c3 += 1) {
    for (int c4 = 0; c4 <= lastChan; c4 += 1) {
      for (int c5 = 0; c5 <= 9; c5 += 1) {
        _input_0[c3][c4][c5][t0] = input[c3][5*b1 + c4][10*b2 + c5][t0];
      }
    }
  }
  // Copy in the current output values; only the first 10 threads own a column.
  if (t0 <= 9) {
    for (int c3 = 0; c3 <= 3; c3 += 1) {
      for (int c4 = 0; c4 <= lastChan; c4 += 1) {
        for (int c5 = 0; c5 <= 4; c5 += 1) {
          _output_0[c3][c4][c5][t0] = output[c3][5*b1 + c4][5*b2 + c5][t0];
        }
      }
    }
  }
  __syncthreads();
  // Reduction: c8 enumerates the four window elements; thread t0 is active on
  // steps 2*t0 and 2*t0 + 1, so threads 0 and 1 update the same accumulator
  // on different steps.
  for (int c4 = 0; c4 <= 3; c4 += 1) {
    for (int c5 = 5 * b1; c5 <= min(31, 5 * b1 + 4); c5 += 1) {
      for (int c6 = 5 * b2; c6 <= 5 * b2 + 4; c6 += 1) {
        for (int c7 = 0; c7 <= 9; c7 += 1) {
          for (int c8 = 0; c8 <= 3; c8 += 1) {
            if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
              _output_0[c4][-5*b1 + c5][-5*b2 + c6][c7] = (_output_0[c4][-5*b1 + c5][-5*b2 + c6][c7] + _input_0[c4][-5*b1 + c5][t0 - 10*b2 + 2*c6][-2*t0 + 2*c7 + c8]);
            }
            // Without a barrier here the read-modify-write of one thread can
            // overlap the other's and lose an update. The loop bounds are
            // block-uniform and this loop nest is outside the t0 guard, so
            // every thread reaches it.
            __syncthreads();
          }
        }
      }
    }
  }
  __syncthreads();
  // Copy the accumulated tile back to global memory.
  if (t0 <= 9) {
    for (int c3 = 0; c3 <= 3; c3 += 1) {
      for (int c4 = 0; c4 <= lastChan; c4 += 1) {
        for (int c5 = 0; c5 <= 4; c5 += 1) {
          output[c3][5*b1 + c4][5*b2 + c5][t0] = _output_0[c3][c4][c5][t0];
        }
      }
    }
  }
  __syncthreads();
}
}

Generation 3	Job[Compiled, GPU] (4, 3)/10   Time (us): best: 558 median: 604 worst: 640
// Integer division that rounds toward negative infinity. Plain `/` truncates
// toward zero, which is off by one for negative numerators. The rounding
// adjustment assumes a positive divisor d.
template<typename T> inline __device__ T floord(T n, T d) {
  if (n < 0) {
    // -ceil(-n / d), which equals floor(n / d) when d > 0.
    return - (-n + d - 1)/d;
  }
  return n / d;
}

// Fixed-name scalar aliases (Halide-style type names) used by the kernel
// signature and its array views.
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;


extern "C" {
// Generated 2x2 sum-pooling kernel working directly on global memory, with no
// use of the block index. `pinput` is viewed as [..][32][20][20] and
// `poutput` as [..][32][10][10]; output[n][c][h][w] accumulates (on top of
// its current value) the inputs at rows 2h..2h+1 and columns 2w..2w+1.
// B, C, H and W are accepted but never read. threadIdx.x selects which window
// row a thread contributes.
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
  const int tx = threadIdx.x;
  float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
  float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
  // A thread is active on reduction steps 2*tx and 2*tx + 1 only.
  const int firstStep = 2 * tx;
  // Channels are visited in groups of three; the last group is clipped at 31.
  for (int chanBase = 0; chanBase < 32; chanBase += 3) {
    const int lastChan = min(2, 31 - chanBase);
    for (int n = 0; n < 4; ++n) {
      for (int dc = 0; dc <= lastChan; ++dc) {
        const int c = chanBase + dc;
        for (int h = 0; h < 10; ++h) {
          for (int w = 0; w < 10; ++w) {
            for (int k = 0; k < 4; ++k) {
              if (firstStep <= k && k <= firstStep + 1) {
                output[n][c][h][w] += input[n][c][tx + 2*h][2*w + k - firstStep];
              }
              // Reached by every thread on every step: orders the updates
              // that different threads make to the same output element.
              __syncthreads();
            }
          }
        }
      }
    }
  }
}
}

Generation 3	Job[Compiled, GPU] (6, 5)/10   Time (us): best: 111 median: 604 worst: 640
Generation 3	Job[Compiled, GPU] (6, 5)/10   Time (us): best: 111 median: 604 worst: 640
// Integer division that rounds toward negative infinity. Plain `/` truncates
// toward zero, which is off by one for negative numerators. The rounding
// adjustment assumes a positive divisor d.
template<typename T> inline __device__ T floord(T n, T d) {
  if (n < 0) {
    // -ceil(-n / d), which equals floor(n / d) when d > 0.
    return - (-n + d - 1)/d;
  }
  return n / d;
}

// Fixed-name scalar aliases (Halide-style type names) used by the kernel
// signature and its array views.
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;


extern "C" {
// Generated 2x2 sum-pooling kernel that accumulates in shared memory.
// `pinput` is viewed as [..][32][20][20] and `poutput` as [..][32][10][10];
// output[n][c][h][w] accumulates (on top of its current value) the inputs at
// rows 2h..2h+1 and columns 2w..2w+1. B, C, H and W are accepted but never
// read. blockIdx.x selects a pair of batch entries and blockIdx.y a group of
// 16 channels. During the copies threadIdx.x is the column and threadIdx.y
// strides over rows in steps of 3; only threads with threadIdx.y == 0 take
// part in the reduction.
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
  const int batchBase = 2 * blockIdx.x;
  const int chanBase = 16 * blockIdx.y;
  const int tx = threadIdx.x;
  const int ty = threadIdx.y;
  float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
  float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
  __shared__ float32 outTile[2][16][10][11];
  __syncthreads();
  // Copy in the current output values of this block's tile.
  for (int dn = 0; dn < 2; ++dn) {
    for (int dc = 0; dc < 16; ++dc) {
      for (int h = ty; h < 10; h += 3) {
        outTile[dn][dc][h][tx] = output[batchBase + dn][chanBase + dc][h][tx];
      }
    }
  }
  __syncthreads();
  // A thread is active on reduction steps 2*tx and 2*tx + 1 only.
  const int firstStep = 2 * tx;
  // Output rows are visited in groups of four (0-3, 4-7, 8-9).
  for (int rowBase = 0; rowBase < 10; rowBase += 4) {
    const int lastRow = min(9, rowBase + 3);
    for (int dn = 0; dn < 2; ++dn) {
      for (int dc = 0; dc < 16; ++dc) {
        for (int h = rowBase; h <= lastRow; ++h) {
          for (int w = 0; w < 10; ++w) {
            for (int k = 0; k < 4; ++k) {
              if (ty == 0 && firstStep <= k && k <= firstStep + 1) {
                outTile[dn][dc][h][w] += input[batchBase + dn][chanBase + dc][tx + 2*h][2*w + k - firstStep];
              }
              // Reached by every thread on every step: orders the updates
              // that different threads make to the same accumulator.
              __syncthreads();
            }
          }
        }
      }
    }
  }
  __syncthreads();
  // Copy the accumulated tile back to global memory.
  for (int dn = 0; dn < 2; ++dn) {
    for (int dc = 0; dc < 16; ++dc) {
      for (int h = ty; h < 10; h += 3) {
        output[batchBase + dn][chanBase + dc][h][tx] = outTile[dn][dc][h][tx];
      }
    }
  }
  __syncthreads();
}
}

Generation 3	Job[Compiled, GPU] (7, 6)/10   Time (us): best: 111 median: 604 worst: 640
// Integer division that rounds toward negative infinity. Plain `/` truncates
// toward zero, which is off by one for negative numerators. The rounding
// adjustment assumes a positive divisor d.
template<typename T> inline __device__ T floord(T n, T d) {
  if (n < 0) {
    // -ceil(-n / d), which equals floor(n / d) when d > 0.
    return - (-n + d - 1)/d;
  }
  return n / d;
}

// Fixed-name scalar aliases (Halide-style type names) used by the kernel
// signature and its array views.
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;


extern "C" {
// Generated 2x2 sum-pooling kernel that accumulates its output tile in shared
// memory. `pinput` is viewed as [..][32][20][20] and `poutput` as
// [..][32][10][10]; output[n][c][h][w] accumulates (on top of its current
// value) the inputs at rows 2h..2h+1 and columns 2w..2w+1. B, C, H and W are
// accepted but never read.
// Tiling: blockIdx.x -> 3 batch entries (clipped at 3), blockIdx.y -> 20
// channels (clipped at 31), blockIdx.z -> 5 output rows. threadIdx.x is the
// output column during the copies and the window row during the reduction.
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
  int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
  int t0 = threadIdx.x;
  float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
  float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
  __shared__ float32 _output_0[3][20][5][11];
  // Last valid offsets inside this block's batch and channel tiles.
  const int lastBatch = min(2, -3 * b0 + 3);
  const int lastChan = min(19, -20 * b1 + 31);
  __syncthreads();
  // Copy in: each thread loads column t0 of every row of the output tile.
  for (int c3 = 0; c3 <= lastBatch; c3 += 1) {
    for (int c4 = 0; c4 <= lastChan; c4 += 1) {
      for (int c5 = 0; c5 <= 4; c5 += 1) {
        _output_0[c3][c4][c5][t0] = output[3*b0 + c3][20*b1 + c4][5*b2 + c5][t0];
      }
    }
  }
  __syncthreads();
  // Reduction: c8 enumerates the four window elements; thread t0 is active on
  // steps 2*t0 and 2*t0 + 1, so threads 0 and 1 update the same accumulator
  // on different steps.
  for (int c4 = 3 * b0; c4 <= min(3, 3 * b0 + 2); c4 += 1) {
    for (int c5 = 20 * b1; c5 <= min(31, 20 * b1 + 19); c5 += 1) {
      for (int c6 = 5 * b2; c6 <= 5 * b2 + 4; c6 += 1) {
        for (int c7 = 0; c7 <= 9; c7 += 1) {
          for (int c8 = 0; c8 <= 3; c8 += 1) {
            if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
              _output_0[-3*b0 + c4][-20*b1 + c5][-5*b2 + c6][c7] = (_output_0[-3*b0 + c4][-20*b1 + c5][-5*b2 + c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]);
            }
            // Without a barrier here the read-modify-write of one thread can
            // overlap the other's and lose an update. All enclosing loop
            // bounds are block-uniform, so every thread reaches it.
            __syncthreads();
          }
        }
      }
    }
  }
  __syncthreads();
  // Copy out: mirror of the copy in.
  for (int c3 = 0; c3 <= lastBatch; c3 += 1) {
    for (int c4 = 0; c4 <= lastChan; c4 += 1) {
      for (int c5 = 0; c5 <= 4; c5 += 1) {
        output[3*b0 + c3][20*b1 + c4][5*b2 + c5][t0] = _output_0[c3][c4][c5][t0];
      }
    }
  }
  __syncthreads();
}
}

Generation 3	Job[Compiled, GPU] (8, 7)/10   Time (us): best: 111 median: 604 worst: 640
// Integer division that rounds toward negative infinity. Plain `/` truncates
// toward zero, which is off by one for negative numerators. The rounding
// adjustment assumes a positive divisor d.
template<typename T> inline __device__ T floord(T n, T d) {
  if (n < 0) {
    // -ceil(-n / d), which equals floor(n / d) when d > 0.
    return - (-n + d - 1)/d;
  }
  return n / d;
}

// Fixed-name scalar aliases (Halide-style type names) used by the kernel
// signature and its array views.
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;


extern "C" {
// Generated 2x2 sum-pooling kernel working directly on global memory.
// `pinput` is viewed as [..][32][20][20] and `poutput` as [..][32][10][10];
// output[n][c][h][w] accumulates (on top of its current value) the inputs at
// rows 2h..2h+1 and columns 2w..2w+1. B, C, H and W are accepted but never
// read.
// Tiling: blockIdx.x -> 3 batch entries (clipped at 3), blockIdx.y -> 20
// channels (clipped at 31), blockIdx.z -> 5 output rows. threadIdx.x selects
// which window row a thread contributes.
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
  int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
  int t0 = threadIdx.x;
  float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
  float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
  // c8 enumerates the four window elements; thread t0 is active on steps
  // 2*t0 and 2*t0 + 1, so threads 0 and 1 update the same output element on
  // different steps.
  for (int c4 = 3 * b0; c4 <= min(3, 3 * b0 + 2); c4 += 1) {
    for (int c5 = 20 * b1; c5 <= min(31, 20 * b1 + 19); c5 += 1) {
      for (int c6 = 5 * b2; c6 <= 5 * b2 + 4; c6 += 1) {
        for (int c7 = 0; c7 <= 9; c7 += 1) {
          for (int c8 = 0; c8 <= 3; c8 += 1) {
            if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
              output[c4][c5][c6][c7] = (output[c4][c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]);
            }
            // Without a barrier here the read-modify-write of one thread can
            // overlap the other's and lose an update. All enclosing loop
            // bounds are block-uniform, so every thread reaches it.
            __syncthreads();
          }
        }
      }
    }
  }
}
}

Generation 3	Job[Compiled, GPU] (9, 8)/10   Time (us): best: 111 median: 558 worst: 640
// Floor division helper: for negative n the quotient is rounded down instead
// of toward zero; non-negative n uses plain division.
// NOTE(review): the rounding formula presumes d > 0 -- confirm with callers.
template<typename T> inline __device__ T floord(T n, T d) {
  if (n < 0) {
    return -(-n + d - 1) / d;
  }
  return n / d;
}

// Halide-style scalar type names, spelled as alias declarations.
// NOTE(review): `long` is only 64 bits wide on LP64 platforms.
using int32 = int;
using int64 = long;
using float32 = float;
using float64 = double;


// Generated pooling kernel (strided-channel variant with a barrier per step).
//
// Accumulates, into each output element [c4][c1 + c5][c2 + c6][c7], input
// values taken from rows 2*(c2 + c6) + t0 and columns 2*c7 and 2*c7 + 1. Only
// the accumulation is performed here: no zero-initialisation of `output` and
// no division by the window size appears in this kernel.
//
// Parameters:
//   B, C, H, W - not read by this kernel; the extents are hard-coded in the
//                array casts and loop bounds below.
//   poutput    - accessed as float32 [..][32][10][10].
//   pinput     - accessed as float32 [..][32][20][20].
//
// NOTE(review): b0 and b2 are never used, so blocks that differ only in
// blockIdx.x or blockIdx.z would repeat the same updates -- confirm the grid
// extents used at launch.
extern "C" {
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
  // Block and thread coordinates; only b1 and t0 are used below.
  int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
  int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
  // View the flat buffers as 4-D arrays with fixed inner extents.
  float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
  float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
  // Groups of 8 indices along the second dimension, starting at 8*b1 and
  // advancing by 16 per iteration.
  // NOTE(review): presumably gridDim.y == 2 so that all 32 indices are covered -- confirm.
  for (int c1 = 8 * b1; c1 <= 31; c1 += 16) {
    // Output rows are processed in tiles of 4 (c2 = 0, 4, 8).
    for (int c2 = 0; c2 <= 9; c2 += 4) {
      for (int c4 = 0; c4 <= 3; c4 += 1) {
        for (int c5 = 0; c5 <= 7; c5 += 1) {
          // The last row tile is clamped so that c2 + c6 never exceeds 9.
          for (int c6 = 0; c6 <= min(3, -c2 + 9); c6 += 1) {
            for (int c7 = 0; c7 <= 9; c7 += 1) {
              for (int c8 = 0; c8 <= 3; c8 += 1) {
                // With c8 in 0..3 only t0 == 0 (c8 = 0, 1) and t0 == 1 (c8 = 2, 3)
                // pass this test; threads with t0 >= 2 never update anything.
                if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
                  // c8 - 2*t0 is 0 or 1, so the column index is 2*c7 or 2*c7 + 1.
                  output[c4][c1 + c5][c2 + c6][c7] = (output[c4][c1 + c5][c2 + c6][c7] + input[c4][c1 + c5][t0 + 2*c2 + 2*c6][-2*t0 + 2*c7 + c8]);
                }
                // The barrier sits outside the conditional, so every thread of the
                // block reaches it on each c8 step; it separates the updates made
                // by t0 == 0 from those made by t0 == 1 to the same output element.
                __syncthreads();
              }
            }
          }
        }
      }
    }
  }
}
}

Generation 3	Job[Compiled, GPU] (10, 9)/10   Time (us): best: 111 median: 558 worst: 640
// Floor division helper: for negative n the quotient is rounded down instead
// of toward zero; non-negative n uses plain division.
// NOTE(review): the rounding formula presumes d > 0 -- confirm with callers.
template<typename T> inline __device__ T floord(T n, T d) {
  if (n < 0) {
    return -(-n + d - 1) / d;
  }
  return n / d;
}

// Halide-style scalar type names, spelled as alias declarations.
// NOTE(review): `long` is only 64 bits wide on LP64 platforms.
using int32 = int;
using int64 = long;
using float32 = float;
using float64 = double;


// Generated pooling kernel (row tiles of 5 with a barrier per step).
//
// Accumulates, into each output element [c4][c5][c6][c7], input values taken
// from rows 2*c6 + t0 and columns 2*c7 and 2*c7 + 1. Only the accumulation is
// performed here: no zero-initialisation of `output` and no division by the
// window size appears in this kernel.
//
// Parameters:
//   B, C, H, W - not read by this kernel; the extents are hard-coded in the
//                array casts and loop bounds below.
//   poutput    - accessed as float32 [..][32][10][10].
//   pinput     - accessed as float32 [..][32][20][20].
//
// NOTE(review): b0 and b2 are never used, so blocks that differ only in
// blockIdx.x or blockIdx.z would repeat the same updates -- confirm the grid
// extents used at launch.
extern "C" {
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
  // Block and thread coordinates; only b1 and t0 are used below.
  int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
  int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
  // View the flat buffers as 4-D arrays with fixed inner extents.
  float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
  float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
  // Output rows are processed in two tiles of 5 (c2 = 0 and c2 = 5).
  for (int c2 = 0; c2 <= 9; c2 += 5) {
    for (int c4 = 0; c4 <= 3; c4 += 1) {
      // b1 selects a tile of up to 20 indices along the second dimension (clamped to 31).
      for (int c5 = 20 * b1; c5 <= min(31, 20 * b1 + 19); c5 += 1) {
        for (int c6 = c2; c6 <= c2 + 4; c6 += 1) {
          for (int c7 = 0; c7 <= 9; c7 += 1) {
            for (int c8 = 0; c8 <= 3; c8 += 1) {
              // With c8 in 0..3 only t0 == 0 (c8 = 0, 1) and t0 == 1 (c8 = 2, 3)
              // pass this test; threads with t0 >= 2 never update anything.
              if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
                // c8 - 2*t0 is 0 or 1, so the column index is 2*c7 or 2*c7 + 1.
                output[c4][c5][c6][c7] = (output[c4][c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]);
              }
              // The barrier sits outside the conditional, so every thread of the
              // block reaches it on each c8 step; it separates the updates made
              // by t0 == 0 from those made by t0 == 1 to the same output element.
              __syncthreads();
            }
          }
        }
      }
    }
  }
}
}

Generation 3	Job[Compiled, GPU] (10, 10)/10   Time (us): best: 111 median: 558 worst: 640
Generation 3	Job[Compiled, GPU] (10, 10)/10   Time (us): best: 111 median: 558 worst: 640