Skip to content

Instantly share code, notes, and snippets.

@soumith
Forked from anonymous/out.log
Created February 12, 2018 21:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save soumith/24bce8b153e033ff0f9dbd25c51e07bb to your computer and use it in GitHub Desktop.
Save soumith/24bce8b153e033ff0f9dbd25c51e07bb to your computer and use it in GitHub Desktop.
[WARNING]: No mapping options supplied. 'Naive' options will be used which might fail compilation
[WARNING]: Autotuning results won't be cached. 'cache' option is not specified
[WARNING]: Using naive options for autotuning
template<typename T> inline __device__ T floord(T n, T d) {
return n < 0 ? - (-n + d - 1)/d : n / d;
}
// Halide type handling
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;
extern "C" {
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
for (int c4 = 0; c4 <= 3; c4 += 1) {
for (int c5 = 0; c5 <= 31; c5 += 1) {
for (int c6 = 0; c6 <= 9; c6 += 1) {
for (int c7 = 0; c7 <= 9; c7 += 1) {
for (int c8 = 0; c8 <= 3; c8 += 1) {
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
output[c4][c5][c6][c7] = (output[c4][c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]);
}
__syncthreads();
}
}
}
}
}
}
}
template<typename T> inline __device__ T floord(T n, T d) {
return n < 0 ? - (-n + d - 1)/d : n / d;
}
// Halide type handling
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;
extern "C" {
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
for (int c4 = 0; c4 <= 3; c4 += 1) {
for (int c5 = 0; c5 <= 31; c5 += 1) {
for (int c6 = 0; c6 <= 9; c6 += 1) {
for (int c7 = 0; c7 <= 9; c7 += 1) {
for (int c8 = 0; c8 <= 3; c8 += 1) {
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
output[c4][c5][c6][c7] = (output[c4][c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]);
}
__syncthreads();
}
}
}
}
}
}
}
template<typename T> inline __device__ T floord(T n, T d) {
return n < 0 ? - (-n + d - 1)/d : n / d;
}
// Halide type handling
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;
extern "C" {
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
for (int c4 = 0; c4 <= 3; c4 += 1) {
for (int c5 = 0; c5 <= 31; c5 += 1) {
for (int c6 = 0; c6 <= 9; c6 += 1) {
for (int c7 = 0; c7 <= 9; c7 += 1) {
for (int c8 = 0; c8 <= 3; c8 += 1) {
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
output[c4][c5][c6][c7] = (output[c4][c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]);
}
__syncthreads();
}
}
}
}
}
}
}
 Generation 0 Job[Compiled, GPU] (2, 1)/10 Time (us): best: 13146 median: 13146 worst: 13146 Generation 0 Job[Compiled, GPU] (2, 1)/10 Time (us): best: 13146 median: 13146 worst: 13146
template<typename T> inline __device__ T floord(T n, T d) {
return n < 0 ? - (-n + d - 1)/d : n / d;
}
// Halide type handling
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;
extern "C" {
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
__shared__ float32 _output_0[4][20][10][11];
for (int c1 = 0; c1 <= 31; c1 += 20) {
__syncthreads();
for (int c3 = 0; c3 <= 3; c3 += 1) {
for (int c4 = t2; c4 <= min(19, -c1 + 31); c4 += 5) {
for (int c5 = t1; c5 <= 9; c5 += 5) {
for (int c6 = t0; c6 <= 9; c6 += 4) {
_output_0[c3][c4][c5][c6] = output[c3][c1 + c4][c5][c6];
}
}
}
}
__syncthreads();
for (int c3 = 0; c3 <= 9; c3 += 8) {
for (int c4 = 0; c4 <= 3; c4 += 1) {
for (int c5 = 0; c5 <= min(19, -c1 + 31); c5 += 1) {
for (int c6 = 0; c6 <= 9; c6 += 1) {
for (int c7 = 0; c7 <= min(7, -c3 + 9); c7 += 1) {
for (int c8 = 0; c8 <= 3; c8 += 1) {
if (t1 == 0 && t2 == 0 && c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
_output_0[c4][c5][c6][c3 + c7] = (_output_0[c4][c5][c6][c3 + c7] + input[c4][c1 + c5][t0 + 2*c6][-2*t0 + 2*c3 + 2*c7 + c8]);
}
__syncthreads();
}
}
}
}
}
}
__syncthreads();
for (int c3 = 0; c3 <= 3; c3 += 1) {
for (int c4 = t2; c4 <= min(19, -c1 + 31); c4 += 5) {
for (int c5 = t1; c5 <= 9; c5 += 5) {
for (int c6 = t0; c6 <= 9; c6 += 4) {
output[c3][c1 + c4][c5][c6] = _output_0[c3][c4][c5][c6];
}
}
}
}
__syncthreads();
}
}
}
template<typename T> inline __device__ T floord(T n, T d) {
return n < 0 ? - (-n + d - 1)/d : n / d;
}
// Halide type handling
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;
extern "C" {
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
for (int c4 = 0; c4 <= 3; c4 += 1) {
for (int c5 = 0; c5 <= 31; c5 += 1) {
for (int c6 = 0; c6 <= 9; c6 += 1) {
for (int c7 = 0; c7 <= 9; c7 += 1) {
for (int c8 = 0; c8 <= 3; c8 += 1) {
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
output[c4][c5][c6][c7] = (output[c4][c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]);
}
}
}
}
}
}
}
}
 Generation 0 Job[Compiled, GPU] (3, 2)/10 Time (us): best: 7716 median: 13146 worst: 13146
template<typename T> inline __device__ T floord(T n, T d) {
return n < 0 ? - (-n + d - 1)/d : n / d;
}
// Halide type handling
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;
extern "C" {
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
for (int c4 = 0; c4 <= 3; c4 += 1) {
for (int c5 = 0; c5 <= 31; c5 += 1) {
for (int c6 = 0; c6 <= 9; c6 += 1) {
for (int c7 = 0; c7 <= 9; c7 += 1) {
for (int c8 = 0; c8 <= 3; c8 += 1) {
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
output[c4][c5][c6][c7] = (output[c4][c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]);
}
__syncthreads();
}
}
}
}
}
}
}
 Generation 0 Job[Compiled, GPU] (5, 4)/10 Time (us): best: 7716 median: 11094 worst: 13146
template<typename T> inline __device__ T floord(T n, T d) {
return n < 0 ? - (-n + d - 1)/d : n / d;
}
// Halide type handling
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;
extern "C" {
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
for (int c1 = 0; c1 <= 31; c1 += 16) {
for (int c2 = 0; c2 <= 9; c2 += 4) {
for (int c4 = 2 * b0; c4 <= 2 * b0 + 1; c4 += 1) {
for (int c5 = c1; c5 <= c1 + 15; c5 += 1) {
for (int c6 = c2; c6 <= min(9, c2 + 3); c6 += 1) {
for (int c7 = 0; c7 <= 9; c7 += 1) {
for (int c8 = 0; c8 <= 3; c8 += 1) {
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
output[c4][c5][c6][c7] = (output[c4][c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]);
}
__syncthreads();
}
}
}
}
}
}
}
}
}
 Generation 0 Job[Compiled, GPU] (6, 5)/10 Time (us): best: 5871 median: 7900 worst: 13146
template<typename T> inline __device__ T floord(T n, T d) {
return n < 0 ? - (-n + d - 1)/d : n / d;
}
// Halide type handling
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;
extern "C" {
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
__shared__ float32 _output_0[2][32][10][11];
__syncthreads();
for (int c3 = 0; c3 <= 1; c3 += 1) {
for (int c4 = 0; c4 <= 31; c4 += 1) {
_output_0[c3][c4][t1][t0] = output[2*b0 + c3][c4][t1][t0];
}
}
__syncthreads();
for (int c4 = 0; c4 <= 1; c4 += 1) {
for (int c5 = 0; c5 <= 31; c5 += 1) {
for (int c6 = 0; c6 <= 9; c6 += 1) {
for (int c7 = 0; c7 <= 9; c7 += 1) {
for (int c8 = 0; c8 <= 3; c8 += 1) {
if (t1 == 0 && c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
_output_0[c4][c5][c6][c7] = (_output_0[c4][c5][c6][c7] + input[2*b0 + c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]);
}
__syncthreads();
}
}
}
}
}
__syncthreads();
for (int c3 = 0; c3 <= 1; c3 += 1) {
for (int c4 = 0; c4 <= 31; c4 += 1) {
output[2*b0 + c3][c4][t1][t0] = _output_0[c3][c4][t1][t0];
}
}
__syncthreads();
}
}
 Generation 0 Job[Compiled, GPU] (6, 5)/10 Time (us): best: 5871 median: 7900 worst: 13146
template<typename T> inline __device__ T floord(T n, T d) {
return n < 0 ? - (-n + d - 1)/d : n / d;
}
// Halide type handling
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;
extern "C" {
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
for (int c1 = 0; c1 <= 31; c1 += 4) {
for (int c2 = 0; c2 <= 9; c2 += 1) {
for (int c4 = 0; c4 <= 3; c4 += 1) {
for (int c5 = 0; c5 <= 3; c5 += 1) {
for (int c7 = 0; c7 <= 9; c7 += 1) {
for (int c8 = 0; c8 <= 3; c8 += 1) {
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
output[c4][c1 + c5][c2][c7] = (output[c4][c1 + c5][c2][c7] + input[c4][c1 + c5][t0 + 2*c2][-2*t0 + 2*c7 + c8]);
}
__syncthreads();
}
}
}
}
}
}
}
}
 Generation 0 Job[Compiled, GPU] (8, 7)/10 Time (us): best: 3472 median: 7900 worst: 13146
template<typename T> inline __device__ T floord(T n, T d) {
return n < 0 ? - (-n + d - 1)/d : n / d;
}
// Halide type handling
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;
extern "C" {
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
for (int c2 = 0; c2 <= 9; c2 += 5) {
for (int c3 = 0; c3 <= 9; c3 += 2) {
for (int c4 = 0; c4 <= 3; c4 += 1) {
for (int c5 = 16 * b1; c5 <= 16 * b1 + 15; c5 += 1) {
for (int c6 = c2; c6 <= c2 + 4; c6 += 1) {
for (int c7 = c3; c7 <= c3 + 1; c7 += 1) {
for (int c8 = 0; c8 <= 3; c8 += 1) {
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
output[c4][c5][c6][c7] = (output[c4][c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]);
}
}
}
}
}
}
}
}
}
}
template<typename T> inline __device__ T floord(T n, T d) {
return n < 0 ? - (-n + d - 1)/d : n / d;
}
// Halide type handling
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;
extern "C" {
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
for (int c3 = 0; c3 <= 9; c3 += 8) {
for (int c4 = 0; c4 <= min(2, -3 * b0 + 3); c4 += 1) {
for (int c5 = 0; c5 <= min(19, -20 * b1 + 31); c5 += 1) {
for (int c6 = 0; c6 <= 4; c6 += 1) {
for (int c7 = 0; c7 <= min(7, -c3 + 9); c7 += 1) {
for (int c8 = 0; c8 <= 3; c8 += 1) {
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
output[3*b0 + c4][20*b1 + c5][5*b2 + c6][c3 + c7] = (output[3*b0 + c4][20*b1 + c5][5*b2 + c6][c3 + c7] + input[3*b0 + c4][20*b1 + c5][t0 + 10*b2 + 2*c6][-2*t0 + 2*c3 + 2*c7 + c8]);
}
}
}
}
}
}
}
}
}
 Generation 0 Job[Compiled, GPU] (9, 8)/10 Time (us): best: 3472 median: 7900 worst: 13146
template<typename T> inline __device__ T floord(T n, T d) {
return n < 0 ? - (-n + d - 1)/d : n / d;
}
// Halide type handling
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;
extern "C" {
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
for (int c2 = 0; c2 <= 9; c2 += 5) {
for (int c4 = 0; c4 <= 3; c4 += 1) {
for (int c5 = 0; c5 <= 31; c5 += 1) {
for (int c6 = c2; c6 <= c2 + 4; c6 += 1) {
for (int c7 = 0; c7 <= 9; c7 += 1) {
for (int c8 = 0; c8 <= 3; c8 += 1) {
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
output[c4][c5][c6][c7] = (output[c4][c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]);
}
__syncthreads();
}
}
}
}
}
}
}
}
 Generation 0 Job[Compiled, GPU] (10, 10)/10 Time (us): best: 2044 median: 7716 worst: 13146 Generation 0 Job[Compiled, GPU] (10, 10)/10 Time (us): best: 2044 median: 7716 worst: 13146
 Generation 1 Job[Compiled, GPU] (3, 2)/10 Time (us): best: 2042 median: 3471 worst: 3471
template<typename T> inline __device__ T floord(T n, T d) {
return n < 0 ? - (-n + d - 1)/d : n / d;
}
// Halide type handling
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;
extern "C" {
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
for (int c2 = 0; c2 <= 9; c2 += 5) {
for (int c3 = 0; c3 <= 9; c3 += 2) {
for (int c4 = 0; c4 <= 3; c4 += 1) {
for (int c5 = 16 * b1; c5 <= 16 * b1 + 15; c5 += 1) {
for (int c6 = c2; c6 <= c2 + 4; c6 += 1) {
for (int c7 = c3; c7 <= c3 + 1; c7 += 1) {
for (int c8 = 0; c8 <= 3; c8 += 1) {
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
output[c4][c5][c6][c7] = (output[c4][c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]);
}
__syncthreads();
}
}
}
}
}
}
}
}
}
 Generation 1 Job[Compiled, GPU] (5, 4)/10 Time (us): best: 2042 median: 3471 worst: 6026
template<typename T> inline __device__ T floord(T n, T d) {
return n < 0 ? - (-n + d - 1)/d : n / d;
}
// Halide type handling
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;
extern "C" {
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
for (int c4 = 0; c4 <= 3; c4 += 1) {
for (int c5 = 0; c5 <= 31; c5 += 1) {
for (int c6 = 0; c6 <= 9; c6 += 1) {
for (int c7 = 0; c7 <= 9; c7 += 1) {
for (int c8 = 0; c8 <= 3; c8 += 1) {
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
output[c4][c5][c6][c7] = (output[c4][c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]);
}
}
}
}
}
}
}
}
 Generation 1 Job[Compiled, GPU] (7, 6)/10 Time (us): best: 2042 median: 6026 worst: 7900
template<typename T> inline __device__ T floord(T n, T d) {
return n < 0 ? - (-n + d - 1)/d : n / d;
}
// Halide type handling
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;
extern "C" {
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
for (int c4 = 0; c4 <= 3; c4 += 1) {
for (int c5 = 0; c5 <= 31; c5 += 1) {
for (int c6 = 0; c6 <= 9; c6 += 1) {
for (int c7 = 0; c7 <= 9; c7 += 1) {
for (int c8 = 0; c8 <= 3; c8 += 1) {
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
output[c4][c5][c6][c7] = (output[c4][c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]);
}
__syncthreads();
}
}
}
}
}
}
}
template<typename T> inline __device__ T floord(T n, T d) {
return n < 0 ? - (-n + d - 1)/d : n / d;
}
// Halide type handling
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;
extern "C" {
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
for (int c1 = 4 * b1; c1 <= 31; c1 += 8) {
for (int c4 = 0; c4 <= 3; c4 += 1) {
for (int c5 = 0; c5 <= 3; c5 += 1) {
for (int c7 = 0; c7 <= 9; c7 += 1) {
for (int c8 = 0; c8 <= 3; c8 += 1) {
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
output[c4][c1 + c5][b2][c7] = (output[c4][c1 + c5][b2][c7] + input[c4][c1 + c5][t0 + 2*b2][-2*t0 + 2*c7 + c8]);
}
__syncthreads();
}
}
}
}
}
}
}
 Generation 1 Job[Compiled, GPU] (8, 7)/10 Time (us): best: 2042 median: 6026 worst: 7900
template<typename T> inline __device__ T floord(T n, T d) {
return n < 0 ? - (-n + d - 1)/d : n / d;
}
// Halide type handling
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;
extern "C" {
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
for (int c4 = 0; c4 <= 3; c4 += 1) {
for (int c5 = 0; c5 <= min(2, -3 * b1 + 31); c5 += 1) {
for (int c6 = 0; c6 <= 9; c6 += 1) {
for (int c7 = 0; c7 <= 9; c7 += 1) {
for (int c8 = 0; c8 <= 3; c8 += 1) {
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
output[c4][3*b1 + c5][c6][c7] = (output[c4][3*b1 + c5][c6][c7] + input[c4][3*b1 + c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]);
}
__syncthreads();
}
}
}
}
}
}
}
 Generation 1 Job[Compiled, GPU] (10, 9)/10 Time (us): best: 606 median: 5872 worst: 7900
template<typename T> inline __device__ T floord(T n, T d) {
return n < 0 ? - (-n + d - 1)/d : n / d;
}
// Halide type handling
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;
extern "C" {
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
__shared__ float32 _output_0[4][20][5][11];
__syncthreads();
for (int c3 = 0; c3 <= 3; c3 += 1) {
for (int c4 = 0; c4 <= min(19, -20 * b1 + 31); c4 += 1) {
for (int c5 = 0; c5 <= 4; c5 += 1) {
_output_0[c3][c4][c5][t0] = output[c3][20*b1 + c4][5*b2 + c5][t0];
}
}
}
__syncthreads();
for (int c3 = 0; c3 <= 9; c3 += 8) {
for (int c4 = 0; c4 <= 3; c4 += 1) {
for (int c5 = 0; c5 <= min(19, -20 * b1 + 31); c5 += 1) {
for (int c6 = 0; c6 <= 4; c6 += 1) {
for (int c7 = 0; c7 <= min(7, -c3 + 9); c7 += 1) {
for (int c8 = 0; c8 <= 3; c8 += 1) {
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
_output_0[c4][c5][c6][c3 + c7] = (_output_0[c4][c5][c6][c3 + c7] + input[c4][20*b1 + c5][t0 + 10*b2 + 2*c6][-2*t0 + 2*c3 + 2*c7 + c8]);
}
}
}
}
}
}
}
__syncthreads();
for (int c3 = 0; c3 <= 3; c3 += 1) {
for (int c4 = 0; c4 <= min(19, -20 * b1 + 31); c4 += 1) {
for (int c5 = 0; c5 <= 4; c5 += 1) {
output[c3][20*b1 + c4][5*b2 + c5][t0] = _output_0[c3][c4][c5][t0];
}
}
}
__syncthreads();
}
}
 Generation 1 Job[Compiled, GPU] (10, 9)/10 Time (us): best: 606 median: 5872 worst: 7900 Generation 1 Job[Compiled, GPU] (10, 10)/10 Time (us): best: 606 median: 3471 worst: 7900 Generation 1 Job[Compiled, GPU] (10, 10)/10 Time (us): best: 606 median: 3471 worst: 7900
 Generation 2 Job[Compiled, GPU] (2, 1)/10 Time (us): best: 604 median: 604 worst: 604
template<typename T> inline __device__ T floord(T n, T d) {
return n < 0 ? - (-n + d - 1)/d : n / d;
}
// Halide type handling
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;
extern "C" {
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
__shared__ float32 _output_0[4][20][5][11];
__syncthreads();
for (int c3 = 0; c3 <= 3; c3 += 1) {
for (int c4 = 0; c4 <= min(19, -20 * b1 + 31); c4 += 1) {
for (int c5 = 0; c5 <= 4; c5 += 1) {
_output_0[c3][c4][c5][t0] = output[c3][20*b1 + c4][5*b2 + c5][t0];
}
}
}
__syncthreads();
for (int c4 = 0; c4 <= 3; c4 += 1) {
for (int c5 = 20 * b1; c5 <= min(31, 20 * b1 + 19); c5 += 1) {
for (int c6 = 5 * b2; c6 <= 5 * b2 + 4; c6 += 1) {
for (int c7 = 0; c7 <= 9; c7 += 1) {
for (int c8 = 0; c8 <= 3; c8 += 1) {
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
_output_0[c4][-20*b1 + c5][-5*b2 + c6][c7] = (_output_0[c4][-20*b1 + c5][-5*b2 + c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]);
}
}
}
}
}
}
__syncthreads();
for (int c3 = 0; c3 <= 3; c3 += 1) {
for (int c4 = 0; c4 <= min(19, -20 * b1 + 31); c4 += 1) {
for (int c5 = 0; c5 <= 4; c5 += 1) {
output[c3][20*b1 + c4][5*b2 + c5][t0] = _output_0[c3][c4][c5][t0];
}
}
}
__syncthreads();
}
}
 Generation 2 Job[Compiled, GPU] (3, 2)/10 Time (us): best: 604 median: 604 worst: 604
template<typename T> inline __device__ T floord(T n, T d) {
return n < 0 ? - (-n + d - 1)/d : n / d;
}
// Halide type handling
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;
extern "C" {
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
for (int c4 = 0; c4 <= 3; c4 += 1) {
for (int c5 = 0; c5 <= min(2, -3 * b1 + 31); c5 += 1) {
for (int c6 = 0; c6 <= 9; c6 += 1) {
for (int c7 = 0; c7 <= 9; c7 += 1) {
for (int c8 = 0; c8 <= 3; c8 += 1) {
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
output[c4][3*b1 + c5][c6][c7] = (output[c4][3*b1 + c5][c6][c7] + input[c4][3*b1 + c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]);
}
__syncthreads();
}
}
}
}
}
}
}
 Generation 2 Job[Compiled, GPU] (4, 3)/10 Time (us): best: 604 median: 704 worst: 1052 Generation 2 Job[Compiled, GPU] (4, 3)/10 Time (us): best: 604 median: 704 worst: 1052 Generation 2 Job[Compiled, GPU] (4, 3)/10 Time (us): best: 604 median: 704 worst: 1052 Generation 2 Job[Compiled, GPU] (4, 3)/10 Time (us): best: 604 median: 704 worst: 1052
template<typename T> inline __device__ T floord(T n, T d) {
return n < 0 ? - (-n + d - 1)/d : n / d;
}
// Halide type handling
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;
extern "C" {
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
__shared__ float32 _output_0[3][20][5][11];
__syncthreads();
for (int c3 = 0; c3 <= min(2, -3 * b0 + 3); c3 += 1) {
_output_0[c3][0][0][t0] = output[3*b0 + c3][20*b1][5*b2][t0];
_output_0[c3][0][1][t0] = output[3*b0 + c3][20*b1][1 + 5*b2][t0];
_output_0[c3][0][2][t0] = output[3*b0 + c3][20*b1][2 + 5*b2][t0];
_output_0[c3][0][3][t0] = output[3*b0 + c3][20*b1][3 + 5*b2][t0];
_output_0[c3][0][4][t0] = output[3*b0 + c3][20*b1][4 + 5*b2][t0];
_output_0[c3][1][0][t0] = output[3*b0 + c3][1 + 20*b1][5*b2][t0];
_output_0[c3][1][1][t0] = output[3*b0 + c3][1 + 20*b1][1 + 5*b2][t0];
_output_0[c3][1][2][t0] = output[3*b0 + c3][1 + 20*b1][2 + 5*b2][t0];
_output_0[c3][1][3][t0] = output[3*b0 + c3][1 + 20*b1][3 + 5*b2][t0];
_output_0[c3][1][4][t0] = output[3*b0 + c3][1 + 20*b1][4 + 5*b2][t0];
_output_0[c3][2][0][t0] = output[3*b0 + c3][2 + 20*b1][5*b2][t0];
_output_0[c3][2][1][t0] = output[3*b0 + c3][2 + 20*b1][1 + 5*b2][t0];
_output_0[c3][2][2][t0] = output[3*b0 + c3][2 + 20*b1][2 + 5*b2][t0];
_output_0[c3][2][3][t0] = output[3*b0 + c3][2 + 20*b1][3 + 5*b2][t0];
_output_0[c3][2][4][t0] = output[3*b0 + c3][2 + 20*b1][4 + 5*b2][t0];
_output_0[c3][3][0][t0] = output[3*b0 + c3][3 + 20*b1][5*b2][t0];
_output_0[c3][3][1][t0] = output[3*b0 + c3][3 + 20*b1][1 + 5*b2][t0];
_output_0[c3][3][2][t0] = output[3*b0 + c3][3 + 20*b1][2 + 5*b2][t0];
_output_0[c3][3][3][t0] = output[3*b0 + c3][3 + 20*b1][3 + 5*b2][t0];
_output_0[c3][3][4][t0] = output[3*b0 + c3][3 + 20*b1][4 + 5*b2][t0];
_output_0[c3][4][0][t0] = output[3*b0 + c3][4 + 20*b1][5*b2][t0];
_output_0[c3][4][1][t0] = output[3*b0 + c3][4 + 20*b1][1 + 5*b2][t0];
_output_0[c3][4][2][t0] = output[3*b0 + c3][4 + 20*b1][2 + 5*b2][t0];
_output_0[c3][4][3][t0] = output[3*b0 + c3][4 + 20*b1][3 + 5*b2][t0];
_output_0[c3][4][4][t0] = output[3*b0 + c3][4 + 20*b1][4 + 5*b2][t0];
_output_0[c3][5][0][t0] = output[3*b0 + c3][5 + 20*b1][5*b2][t0];
_output_0[c3][5][1][t0] = output[3*b0 + c3][5 + 20*b1][1 + 5*b2][t0];
_output_0[c3][5][2][t0] = output[3*b0 + c3][5 + 20*b1][2 + 5*b2][t0];
_output_0[c3][5][3][t0] = output[3*b0 + c3][5 + 20*b1][3 + 5*b2][t0];
_output_0[c3][5][4][t0] = output[3*b0 + c3][5 + 20*b1][4 + 5*b2][t0];
_output_0[c3][6][0][t0] = output[3*b0 + c3][6 + 20*b1][5*b2][t0];
_output_0[c3][6][1][t0] = output[3*b0 + c3][6 + 20*b1][1 + 5*b2][t0];
_output_0[c3][6][2][t0] = output[3*b0 + c3][6 + 20*b1][2 + 5*b2][t0];
_output_0[c3][6][3][t0] = output[3*b0 + c3][6 + 20*b1][3 + 5*b2][t0];
_output_0[c3][6][4][t0] = output[3*b0 + c3][6 + 20*b1][4 + 5*b2][t0];
_output_0[c3][7][0][t0] = output[3*b0 + c3][7 + 20*b1][5*b2][t0];
_output_0[c3][7][1][t0] = output[3*b0 + c3][7 + 20*b1][1 + 5*b2][t0];
_output_0[c3][7][2][t0] = output[3*b0 + c3][7 + 20*b1][2 + 5*b2][t0];
_output_0[c3][7][3][t0] = output[3*b0 + c3][7 + 20*b1][3 + 5*b2][t0];
_output_0[c3][7][4][t0] = output[3*b0 + c3][7 + 20*b1][4 + 5*b2][t0];
_output_0[c3][8][0][t0] = output[3*b0 + c3][8 + 20*b1][5*b2][t0];
_output_0[c3][8][1][t0] = output[3*b0 + c3][8 + 20*b1][1 + 5*b2][t0];
_output_0[c3][8][2][t0] = output[3*b0 + c3][8 + 20*b1][2 + 5*b2][t0];
_output_0[c3][8][3][t0] = output[3*b0 + c3][8 + 20*b1][3 + 5*b2][t0];
_output_0[c3][8][4][t0] = output[3*b0 + c3][8 + 20*b1][4 + 5*b2][t0];
_output_0[c3][9][0][t0] = output[3*b0 + c3][9 + 20*b1][5*b2][t0];
_output_0[c3][9][1][t0] = output[3*b0 + c3][9 + 20*b1][1 + 5*b2][t0];
_output_0[c3][9][2][t0] = output[3*b0 + c3][9 + 20*b1][2 + 5*b2][t0];
_output_0[c3][9][3][t0] = output[3*b0 + c3][9 + 20*b1][3 + 5*b2][t0];
_output_0[c3][9][4][t0] = output[3*b0 + c3][9 + 20*b1][4 + 5*b2][t0];
_output_0[c3][10][0][t0] = output[3*b0 + c3][10 + 20*b1][5*b2][t0];
_output_0[c3][10][1][t0] = output[3*b0 + c3][10 + 20*b1][1 + 5*b2][t0];
_output_0[c3][10][2][t0] = output[3*b0 + c3][10 + 20*b1][2 + 5*b2][t0];
_output_0[c3][10][3][t0] = output[3*b0 + c3][10 + 20*b1][3 + 5*b2][t0];
_output_0[c3][10][4][t0] = output[3*b0 + c3][10 + 20*b1][4 + 5*b2][t0];
_output_0[c3][11][0][t0] = output[3*b0 + c3][11 + 20*b1][5*b2][t0];
_output_0[c3][11][1][t0] = output[3*b0 + c3][11 + 20*b1][1 + 5*b2][t0];
_output_0[c3][11][2][t0] = output[3*b0 + c3][11 + 20*b1][2 + 5*b2][t0];
_output_0[c3][11][3][t0] = output[3*b0 + c3][11 + 20*b1][3 + 5*b2][t0];
_output_0[c3][11][4][t0] = output[3*b0 + c3][11 + 20*b1][4 + 5*b2][t0];
if (b1 == 0) {
_output_0[c3][12][0][t0] = output[3*b0 + c3][12][5*b2][t0];
_output_0[c3][12][1][t0] = output[3*b0 + c3][12][1 + 5*b2][t0];
_output_0[c3][12][2][t0] = output[3*b0 + c3][12][2 + 5*b2][t0];
_output_0[c3][12][3][t0] = output[3*b0 + c3][12][3 + 5*b2][t0];
_output_0[c3][12][4][t0] = output[3*b0 + c3][12][4 + 5*b2][t0];
_output_0[c3][13][0][t0] = output[3*b0 + c3][13][5*b2][t0];
_output_0[c3][13][1][t0] = output[3*b0 + c3][13][1 + 5*b2][t0];
_output_0[c3][13][2][t0] = output[3*b0 + c3][13][2 + 5*b2][t0];
_output_0[c3][13][3][t0] = output[3*b0 + c3][13][3 + 5*b2][t0];
_output_0[c3][13][4][t0] = output[3*b0 + c3][13][4 + 5*b2][t0];
_output_0[c3][14][0][t0] = output[3*b0 + c3][14][5*b2][t0];
_output_0[c3][14][1][t0] = output[3*b0 + c3][14][1 + 5*b2][t0];
_output_0[c3][14][2][t0] = output[3*b0 + c3][14][2 + 5*b2][t0];
_output_0[c3][14][3][t0] = output[3*b0 + c3][14][3 + 5*b2][t0];
_output_0[c3][14][4][t0] = output[3*b0 + c3][14][4 + 5*b2][t0];
_output_0[c3][15][0][t0] = output[3*b0 + c3][15][5*b2][t0];
_output_0[c3][15][1][t0] = output[3*b0 + c3][15][1 + 5*b2][t0];
_output_0[c3][15][2][t0] = output[3*b0 + c3][15][2 + 5*b2][t0];
_output_0[c3][15][3][t0] = output[3*b0 + c3][15][3 + 5*b2][t0];
_output_0[c3][15][4][t0] = output[3*b0 + c3][15][4 + 5*b2][t0];
_output_0[c3][16][0][t0] = output[3*b0 + c3][16][5*b2][t0];
_output_0[c3][16][1][t0] = output[3*b0 + c3][16][1 + 5*b2][t0];
_output_0[c3][16][2][t0] = output[3*b0 + c3][16][2 + 5*b2][t0];
_output_0[c3][16][3][t0] = output[3*b0 + c3][16][3 + 5*b2][t0];
_output_0[c3][16][4][t0] = output[3*b0 + c3][16][4 + 5*b2][t0];
_output_0[c3][17][0][t0] = output[3*b0 + c3][17][5*b2][t0];
_output_0[c3][17][1][t0] = output[3*b0 + c3][17][1 + 5*b2][t0];
_output_0[c3][17][2][t0] = output[3*b0 + c3][17][2 + 5*b2][t0];
_output_0[c3][17][3][t0] = output[3*b0 + c3][17][3 + 5*b2][t0];
_output_0[c3][17][4][t0] = output[3*b0 + c3][17][4 + 5*b2][t0];
_output_0[c3][18][0][t0] = output[3*b0 + c3][18][5*b2][t0];
_output_0[c3][18][1][t0] = output[3*b0 + c3][18][1 + 5*b2][t0];
_output_0[c3][18][2][t0] = output[3*b0 + c3][18][2 + 5*b2][t0];
_output_0[c3][18][3][t0] = output[3*b0 + c3][18][3 + 5*b2][t0];
_output_0[c3][18][4][t0] = output[3*b0 + c3][18][4 + 5*b2][t0];
_output_0[c3][19][0][t0] = output[3*b0 + c3][19][5*b2][t0];
_output_0[c3][19][1][t0] = output[3*b0 + c3][19][1 + 5*b2][t0];
_output_0[c3][19][2][t0] = output[3*b0 + c3][19][2 + 5*b2][t0];
_output_0[c3][19][3][t0] = output[3*b0 + c3][19][3 + 5*b2][t0];
_output_0[c3][19][4][t0] = output[3*b0 + c3][19][4 + 5*b2][t0];
}
}
__syncthreads();
for (int c3 = 0; c3 <= 9; c3 += 8) {
for (int c4 = 0; c4 <= min(2, -3 * b0 + 3); c4 += 1) {
for (int c5 = 0; c5 <= min(19, -20 * b1 + 31); c5 += 1) {
for (int c6 = 0; c6 <= 4; c6 += 1) {
for (int c7 = 0; c7 <= min(7, -c3 + 9); c7 += 1) {
for (int c8 = 0; c8 <= 3; c8 += 1) {
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
_output_0[c4][c5][c6][c3 + c7] = (_output_0[c4][c5][c6][c3 + c7] + input[3*b0 + c4][20*b1 + c5][t0 + 10*b2 + 2*c6][-2*t0 + 2*c3 + 2*c7 + c8]);
}
}
}
}
}
}
}
__syncthreads();
for (int c3 = 0; c3 <= min(2, -3 * b0 + 3); c3 += 1) {
output[3*b0 + c3][20*b1][5*b2][t0] = _output_0[c3][0][0][t0];
output[3*b0 + c3][20*b1][1 + 5*b2][t0] = _output_0[c3][0][1][t0];
output[3*b0 + c3][20*b1][2 + 5*b2][t0] = _output_0[c3][0][2][t0];
output[3*b0 + c3][20*b1][3 + 5*b2][t0] = _output_0[c3][0][3][t0];
output[3*b0 + c3][20*b1][4 + 5*b2][t0] = _output_0[c3][0][4][t0];
output[3*b0 + c3][1 + 20*b1][5*b2][t0] = _output_0[c3][1][0][t0];
output[3*b0 + c3][1 + 20*b1][1 + 5*b2][t0] = _output_0[c3][1][1][t0];
output[3*b0 + c3][1 + 20*b1][2 + 5*b2][t0] = _output_0[c3][1][2][t0];
output[3*b0 + c3][1 + 20*b1][3 + 5*b2][t0] = _output_0[c3][1][3][t0];
output[3*b0 + c3][1 + 20*b1][4 + 5*b2][t0] = _output_0[c3][1][4][t0];
output[3*b0 + c3][2 + 20*b1][5*b2][t0] = _output_0[c3][2][0][t0];
output[3*b0 + c3][2 + 20*b1][1 + 5*b2][t0] = _output_0[c3][2][1][t0];
output[3*b0 + c3][2 + 20*b1][2 + 5*b2][t0] = _output_0[c3][2][2][t0];
output[3*b0 + c3][2 + 20*b1][3 + 5*b2][t0] = _output_0[c3][2][3][t0];
output[3*b0 + c3][2 + 20*b1][4 + 5*b2][t0] = _output_0[c3][2][4][t0];
output[3*b0 + c3][3 + 20*b1][5*b2][t0] = _output_0[c3][3][0][t0];
output[3*b0 + c3][3 + 20*b1][1 + 5*b2][t0] = _output_0[c3][3][1][t0];
output[3*b0 + c3][3 + 20*b1][2 + 5*b2][t0] = _output_0[c3][3][2][t0];
output[3*b0 + c3][3 + 20*b1][3 + 5*b2][t0] = _output_0[c3][3][3][t0];
output[3*b0 + c3][3 + 20*b1][4 + 5*b2][t0] = _output_0[c3][3][4][t0];
output[3*b0 + c3][4 + 20*b1][5*b2][t0] = _output_0[c3][4][0][t0];
output[3*b0 + c3][4 + 20*b1][1 + 5*b2][t0] = _output_0[c3][4][1][t0];
output[3*b0 + c3][4 + 20*b1][2 + 5*b2][t0] = _output_0[c3][4][2][t0];
output[3*b0 + c3][4 + 20*b1][3 + 5*b2][t0] = _output_0[c3][4][3][t0];
output[3*b0 + c3][4 + 20*b1][4 + 5*b2][t0] = _output_0[c3][4][4][t0];
output[3*b0 + c3][5 + 20*b1][5*b2][t0] = _output_0[c3][5][0][t0];
output[3*b0 + c3][5 + 20*b1][1 + 5*b2][t0] = _output_0[c3][5][1][t0];
output[3*b0 + c3][5 + 20*b1][2 + 5*b2][t0] = _output_0[c3][5][2][t0];
output[3*b0 + c3][5 + 20*b1][3 + 5*b2][t0] = _output_0[c3][5][3][t0];
output[3*b0 + c3][5 + 20*b1][4 + 5*b2][t0] = _output_0[c3][5][4][t0];
output[3*b0 + c3][6 + 20*b1][5*b2][t0] = _output_0[c3][6][0][t0];
output[3*b0 + c3][6 + 20*b1][1 + 5*b2][t0] = _output_0[c3][6][1][t0];
output[3*b0 + c3][6 + 20*b1][2 + 5*b2][t0] = _output_0[c3][6][2][t0];
output[3*b0 + c3][6 + 20*b1][3 + 5*b2][t0] = _output_0[c3][6][3][t0];
output[3*b0 + c3][6 + 20*b1][4 + 5*b2][t0] = _output_0[c3][6][4][t0];
output[3*b0 + c3][7 + 20*b1][5*b2][t0] = _output_0[c3][7][0][t0];
output[3*b0 + c3][7 + 20*b1][1 + 5*b2][t0] = _output_0[c3][7][1][t0];
output[3*b0 + c3][7 + 20*b1][2 + 5*b2][t0] = _output_0[c3][7][2][t0];
output[3*b0 + c3][7 + 20*b1][3 + 5*b2][t0] = _output_0[c3][7][3][t0];
output[3*b0 + c3][7 + 20*b1][4 + 5*b2][t0] = _output_0[c3][7][4][t0];
output[3*b0 + c3][8 + 20*b1][5*b2][t0] = _output_0[c3][8][0][t0];
output[3*b0 + c3][8 + 20*b1][1 + 5*b2][t0] = _output_0[c3][8][1][t0];
output[3*b0 + c3][8 + 20*b1][2 + 5*b2][t0] = _output_0[c3][8][2][t0];
output[3*b0 + c3][8 + 20*b1][3 + 5*b2][t0] = _output_0[c3][8][3][t0];
output[3*b0 + c3][8 + 20*b1][4 + 5*b2][t0] = _output_0[c3][8][4][t0];
output[3*b0 + c3][9 + 20*b1][5*b2][t0] = _output_0[c3][9][0][t0];
output[3*b0 + c3][9 + 20*b1][1 + 5*b2][t0] = _output_0[c3][9][1][t0];
output[3*b0 + c3][9 + 20*b1][2 + 5*b2][t0] = _output_0[c3][9][2][t0];
output[3*b0 + c3][9 + 20*b1][3 + 5*b2][t0] = _output_0[c3][9][3][t0];
output[3*b0 + c3][9 + 20*b1][4 + 5*b2][t0] = _output_0[c3][9][4][t0];
output[3*b0 + c3][10 + 20*b1][5*b2][t0] = _output_0[c3][10][0][t0];
output[3*b0 + c3][10 + 20*b1][1 + 5*b2][t0] = _output_0[c3][10][1][t0];
output[3*b0 + c3][10 + 20*b1][2 + 5*b2][t0] = _output_0[c3][10][2][t0];
output[3*b0 + c3][10 + 20*b1][3 + 5*b2][t0] = _output_0[c3][10][3][t0];
output[3*b0 + c3][10 + 20*b1][4 + 5*b2][t0] = _output_0[c3][10][4][t0];
output[3*b0 + c3][11 + 20*b1][5*b2][t0] = _output_0[c3][11][0][t0];
output[3*b0 + c3][11 + 20*b1][1 + 5*b2][t0] = _output_0[c3][11][1][t0];
output[3*b0 + c3][11 + 20*b1][2 + 5*b2][t0] = _output_0[c3][11][2][t0];
output[3*b0 + c3][11 + 20*b1][3 + 5*b2][t0] = _output_0[c3][11][3][t0];
output[3*b0 + c3][11 + 20*b1][4 + 5*b2][t0] = _output_0[c3][11][4][t0];
if (b1 == 0) {
output[3*b0 + c3][12][5*b2][t0] = _output_0[c3][12][0][t0];
output[3*b0 + c3][12][1 + 5*b2][t0] = _output_0[c3][12][1][t0];
output[3*b0 + c3][12][2 + 5*b2][t0] = _output_0[c3][12][2][t0];
output[3*b0 + c3][12][3 + 5*b2][t0] = _output_0[c3][12][3][t0];
output[3*b0 + c3][12][4 + 5*b2][t0] = _output_0[c3][12][4][t0];
output[3*b0 + c3][13][5*b2][t0] = _output_0[c3][13][0][t0];
output[3*b0 + c3][13][1 + 5*b2][t0] = _output_0[c3][13][1][t0];
output[3*b0 + c3][13][2 + 5*b2][t0] = _output_0[c3][13][2][t0];
output[3*b0 + c3][13][3 + 5*b2][t0] = _output_0[c3][13][3][t0];
output[3*b0 + c3][13][4 + 5*b2][t0] = _output_0[c3][13][4][t0];
output[3*b0 + c3][14][5*b2][t0] = _output_0[c3][14][0][t0];
output[3*b0 + c3][14][1 + 5*b2][t0] = _output_0[c3][14][1][t0];
output[3*b0 + c3][14][2 + 5*b2][t0] = _output_0[c3][14][2][t0];
output[3*b0 + c3][14][3 + 5*b2][t0] = _output_0[c3][14][3][t0];
output[3*b0 + c3][14][4 + 5*b2][t0] = _output_0[c3][14][4][t0];
output[3*b0 + c3][15][5*b2][t0] = _output_0[c3][15][0][t0];
output[3*b0 + c3][15][1 + 5*b2][t0] = _output_0[c3][15][1][t0];
output[3*b0 + c3][15][2 + 5*b2][t0] = _output_0[c3][15][2][t0];
output[3*b0 + c3][15][3 + 5*b2][t0] = _output_0[c3][15][3][t0];
output[3*b0 + c3][15][4 + 5*b2][t0] = _output_0[c3][15][4][t0];
output[3*b0 + c3][16][5*b2][t0] = _output_0[c3][16][0][t0];
output[3*b0 + c3][16][1 + 5*b2][t0] = _output_0[c3][16][1][t0];
output[3*b0 + c3][16][2 + 5*b2][t0] = _output_0[c3][16][2][t0];
output[3*b0 + c3][16][3 + 5*b2][t0] = _output_0[c3][16][3][t0];
output[3*b0 + c3][16][4 + 5*b2][t0] = _output_0[c3][16][4][t0];
output[3*b0 + c3][17][5*b2][t0] = _output_0[c3][17][0][t0];
output[3*b0 + c3][17][1 + 5*b2][t0] = _output_0[c3][17][1][t0];
output[3*b0 + c3][17][2 + 5*b2][t0] = _output_0[c3][17][2][t0];
output[3*b0 + c3][17][3 + 5*b2][t0] = _output_0[c3][17][3][t0];
output[3*b0 + c3][17][4 + 5*b2][t0] = _output_0[c3][17][4][t0];
output[3*b0 + c3][18][5*b2][t0] = _output_0[c3][18][0][t0];
output[3*b0 + c3][18][1 + 5*b2][t0] = _output_0[c3][18][1][t0];
output[3*b0 + c3][18][2 + 5*b2][t0] = _output_0[c3][18][2][t0];
output[3*b0 + c3][18][3 + 5*b2][t0] = _output_0[c3][18][3][t0];
output[3*b0 + c3][18][4 + 5*b2][t0] = _output_0[c3][18][4][t0];
output[3*b0 + c3][19][5*b2][t0] = _output_0[c3][19][0][t0];
output[3*b0 + c3][19][1 + 5*b2][t0] = _output_0[c3][19][1][t0];
output[3*b0 + c3][19][2 + 5*b2][t0] = _output_0[c3][19][2][t0];
output[3*b0 + c3][19][3 + 5*b2][t0] = _output_0[c3][19][3][t0];
output[3*b0 + c3][19][4 + 5*b2][t0] = _output_0[c3][19][4][t0];
}
}
__syncthreads();
}
}
 Generation 2 Job[Compiled, GPU] (4, 3)/10 Time (us): best: 604 median: 704 worst: 1052 Generation 2 Job[Compiled, GPU] (5, 4)/10 Time (us): best: 604 median: 704 worst: 1052
template<typename T> inline __device__ T floord(T n, T d) {
return n < 0 ? - (-n + d - 1)/d : n / d;
}
// Halide type handling
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;
extern "C" {
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
__shared__ float32 _output_0[2][32][10][11];
__syncthreads();
for (int c2 = 0; c2 <= 1; c2 += 1) {
for (int c3 = 0; c3 <= 31; c3 += 1) {
_output_0[c2][c3][t1][t0] = output[2*b0 + c2][c3][t1][t0];
}
}
__syncthreads();
for (int c4 = 0; c4 <= 1; c4 += 1) {
for (int c5 = 0; c5 <= 31; c5 += 1) {
for (int c6 = 0; c6 <= 9; c6 += 1) {
for (int c7 = 0; c7 <= 9; c7 += 1) {
for (int c8 = 0; c8 <= 3; c8 += 1) {
if (t1 == 0 && c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
_output_0[c4][c5][c6][c7] = (_output_0[c4][c5][c6][c7] + input[2*b0 + c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]);
}
__syncthreads();
}
}
}
}
}
__syncthreads();
for (int c2 = 0; c2 <= 1; c2 += 1) {
for (int c3 = 0; c3 <= 31; c3 += 1) {
output[2*b0 + c2][c3][t1][t0] = _output_0[c2][c3][t1][t0];
}
}
__syncthreads();
}
}
 Generation 2 Job[Compiled, GPU] (6, 5)/10 Time (us): best: 604 median: 704 worst: 1052
template<typename T> inline __device__ T floord(T n, T d) {
return n < 0 ? - (-n + d - 1)/d : n / d;
}
// Halide type handling
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;
extern "C" {
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
__shared__ float32 _output_0[2][16][10][11];
__syncthreads();
for (int c2 = 0; c2 <= 1; c2 += 1) {
for (int c3 = 0; c3 <= 15; c3 += 1) {
for (int c4 = t1; c4 <= 9; c4 += 3) {
_output_0[c2][c3][c4][t0] = output[2*b0 + c2][16*b1 + c3][c4][t0];
}
}
}
__syncthreads();
for (int c2 = 0; c2 <= 9; c2 += 4) {
for (int c4 = 2 * b0; c4 <= 2 * b0 + 1; c4 += 1) {
for (int c5 = 16 * b1; c5 <= 16 * b1 + 15; c5 += 1) {
for (int c6 = c2; c6 <= min(9, c2 + 3); c6 += 1) {
for (int c7 = 0; c7 <= 9; c7 += 1) {
for (int c8 = 0; c8 <= 3; c8 += 1) {
if (t1 == 0 && c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
_output_0[-2*b0 + c4][-16*b1 + c5][c6][c7] = (_output_0[-2*b0 + c4][-16*b1 + c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]);
}
__syncthreads();
}
}
}
}
}
}
__syncthreads();
for (int c2 = 0; c2 <= 1; c2 += 1) {
for (int c3 = 0; c3 <= 15; c3 += 1) {
for (int c4 = t1; c4 <= 9; c4 += 3) {
output[2*b0 + c2][16*b1 + c3][c4][t0] = _output_0[c2][c3][c4][t0];
}
}
}
__syncthreads();
}
}
 Generation 2 Job[Compiled, GPU] (6, 5)/10 Time (us): best: 604 median: 704 worst: 1052 Generation 2 Job[Compiled, GPU] (8, 7)/10 Time (us): best: 604 median: 704 worst: 1769 Generation 2 Job[Compiled, GPU] (8, 7)/10 Time (us): best: 604 median: 704 worst: 1769
template<typename T> inline __device__ T floord(T n, T d) {
return n < 0 ? - (-n + d - 1)/d : n / d;
}
// Halide type handling
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;
extern "C" {
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
__shared__ float32 _output_0[3][20][10][11];
for (int c1 = 0; c1 <= 31; c1 += 20) {
__syncthreads();
for (int c3 = 0; c3 <= min(2, -3 * b0 + 3); c3 += 1) {
for (int c4 = t2; c4 <= min(19, -c1 + 31); c4 += 5) {
for (int c5 = t1; c5 <= 9; c5 += 5) {
for (int c6 = t0; c6 <= 9; c6 += 4) {
_output_0[c3][c4][c5][c6] = output[3*b0 + c3][c1 + c4][c5][c6];
}
}
}
}
__syncthreads();
for (int c3 = 0; c3 <= 9; c3 += 8) {
for (int c4 = 0; c4 <= min(2, -3 * b0 + 3); c4 += 1) {
for (int c5 = 0; c5 <= min(19, -c1 + 31); c5 += 1) {
for (int c6 = 0; c6 <= 9; c6 += 1) {
for (int c7 = 0; c7 <= min(7, -c3 + 9); c7 += 1) {
for (int c8 = 0; c8 <= 3; c8 += 1) {
if (t1 == 0 && t2 == 0 && c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
_output_0[c4][c5][c6][c3 + c7] = (_output_0[c4][c5][c6][c3 + c7] + input[3*b0 + c4][c1 + c5][t0 + 2*c6][-2*t0 + 2*c3 + 2*c7 + c8]);
}
__syncthreads();
}
}
}
}
}
}
__syncthreads();
for (int c3 = 0; c3 <= min(2, -3 * b0 + 3); c3 += 1) {
for (int c4 = t2; c4 <= min(19, -c1 + 31); c4 += 5) {
for (int c5 = t1; c5 <= 9; c5 += 5) {
for (int c6 = t0; c6 <= 9; c6 += 4) {
output[3*b0 + c3][c1 + c4][c5][c6] = _output_0[c3][c4][c5][c6];
}
}
}
}
__syncthreads();
}
}
}
 Generation 2 Job[Compiled, GPU] (10, 9)/10 Time (us): best: 604 median: 704 worst: 1769
template<typename T> inline __device__ T floord(T n, T d) {
return n < 0 ? - (-n + d - 1)/d : n / d;
}
// Halide type handling
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;
extern "C" {
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
for (int c3 = 0; c3 <= 9; c3 += 8) {
for (int c4 = 0; c4 <= 3; c4 += 1) {
for (int c5 = 3 * b1; c5 <= min(31, 3 * b1 + 2); c5 += 1) {
for (int c6 = 5 * b2; c6 <= 5 * b2 + 4; c6 += 1) {
for (int c7 = c3; c7 <= min(9, c3 + 7); c7 += 1) {
for (int c8 = 0; c8 <= 3; c8 += 1) {
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
output[c4][c5][c6][c7] = (output[c4][c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]);
}
__syncthreads();
}
}
}
}
}
}
}
}
 Generation 2 Job[Compiled, GPU] (10, 9)/10 Time (us): best: 604 median: 704 worst: 1769 Generation 2 Job[Compiled, GPU] (10, 10)/10 Time (us): best: 559 median: 704 worst: 1769 Generation 2 Job[Compiled, GPU] (10, 10)/10 Time (us): best: 559 median: 704 worst: 1769
 Generation 3 Job[Compiled, GPU] (3, 2)/10 Time (us): best: 558 median: 604 worst: 604 Generation 3 Job[Compiled, GPU] (4, 3)/10 Time (us): best: 558 median: 604 worst: 640
template<typename T> inline __device__ T floord(T n, T d) {
return n < 0 ? - (-n + d - 1)/d : n / d;
}
// Halide type handling
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;
extern "C" {
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
__shared__ float32 _output_0[4][5][5][11];
__shared__ float32 _input_0[4][5][10][21];
__syncthreads();
for (int c3 = 0; c3 <= 3; c3 += 1) {
for (int c4 = 0; c4 <= min(4, -5 * b1 + 31); c4 += 1) {
for (int c5 = 0; c5 <= 9; c5 += 1) {
_input_0[c3][c4][c5][t0] = input[c3][5*b1 + c4][10*b2 + c5][t0];
}
}
}
if (t0 <= 9) {
for (int c3 = 0; c3 <= 3; c3 += 1) {
for (int c4 = 0; c4 <= min(4, -5 * b1 + 31); c4 += 1) {
for (int c5 = 0; c5 <= 4; c5 += 1) {
_output_0[c3][c4][c5][t0] = output[c3][5*b1 + c4][5*b2 + c5][t0];
}
}
}
}
__syncthreads();
for (int c4 = 0; c4 <= 3; c4 += 1) {
for (int c5 = 5 * b1; c5 <= min(31, 5 * b1 + 4); c5 += 1) {
for (int c6 = 5 * b2; c6 <= 5 * b2 + 4; c6 += 1) {
for (int c7 = 0; c7 <= 9; c7 += 1) {
for (int c8 = 0; c8 <= 3; c8 += 1) {
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
_output_0[c4][-5*b1 + c5][-5*b2 + c6][c7] = (_output_0[c4][-5*b1 + c5][-5*b2 + c6][c7] + _input_0[c4][-5*b1 + c5][t0 - 10*b2 + 2*c6][-2*t0 + 2*c7 + c8]);
}
}
}
}
}
}
__syncthreads();
if (t0 <= 9) {
for (int c3 = 0; c3 <= 3; c3 += 1) {
for (int c4 = 0; c4 <= min(4, -5 * b1 + 31); c4 += 1) {
for (int c5 = 0; c5 <= 4; c5 += 1) {
output[c3][5*b1 + c4][5*b2 + c5][t0] = _output_0[c3][c4][c5][t0];
}
}
}
}
__syncthreads();
}
}
 Generation 3 Job[Compiled, GPU] (4, 3)/10 Time (us): best: 558 median: 604 worst: 640
template<typename T> inline __device__ T floord(T n, T d) {
return n < 0 ? - (-n + d - 1)/d : n / d;
}
// Halide type handling
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;
extern "C" {
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
for (int c1 = 0; c1 <= 31; c1 += 3) {
for (int c4 = 0; c4 <= 3; c4 += 1) {
for (int c5 = 0; c5 <= min(2, -c1 + 31); c5 += 1) {
for (int c6 = 0; c6 <= 9; c6 += 1) {
for (int c7 = 0; c7 <= 9; c7 += 1) {
for (int c8 = 0; c8 <= 3; c8 += 1) {
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
output[c4][c1 + c5][c6][c7] = (output[c4][c1 + c5][c6][c7] + input[c4][c1 + c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]);
}
__syncthreads();
}
}
}
}
}
}
}
}
 Generation 3 Job[Compiled, GPU] (6, 5)/10 Time (us): best: 111 median: 604 worst: 640 Generation 3 Job[Compiled, GPU] (6, 5)/10 Time (us): best: 111 median: 604 worst: 640
template<typename T> inline __device__ T floord(T n, T d) {
return n < 0 ? - (-n + d - 1)/d : n / d;
}
// Halide type handling
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;
extern "C" {
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
__shared__ float32 _output_0[2][16][10][11];
__syncthreads();
for (int c2 = 0; c2 <= 1; c2 += 1) {
for (int c3 = 0; c3 <= 15; c3 += 1) {
for (int c4 = t1; c4 <= 9; c4 += 3) {
_output_0[c2][c3][c4][t0] = output[2*b0 + c2][16*b1 + c3][c4][t0];
}
}
}
__syncthreads();
for (int c2 = 0; c2 <= 9; c2 += 4) {
for (int c4 = 2 * b0; c4 <= 2 * b0 + 1; c4 += 1) {
for (int c5 = 16 * b1; c5 <= 16 * b1 + 15; c5 += 1) {
for (int c6 = c2; c6 <= min(9, c2 + 3); c6 += 1) {
for (int c7 = 0; c7 <= 9; c7 += 1) {
for (int c8 = 0; c8 <= 3; c8 += 1) {
if (t1 == 0 && c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
_output_0[-2*b0 + c4][-16*b1 + c5][c6][c7] = (_output_0[-2*b0 + c4][-16*b1 + c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]);
}
__syncthreads();
}
}
}
}
}
}
__syncthreads();
for (int c2 = 0; c2 <= 1; c2 += 1) {
for (int c3 = 0; c3 <= 15; c3 += 1) {
for (int c4 = t1; c4 <= 9; c4 += 3) {
output[2*b0 + c2][16*b1 + c3][c4][t0] = _output_0[c2][c3][c4][t0];
}
}
}
__syncthreads();
}
}
 Generation 3 Job[Compiled, GPU] (7, 6)/10 Time (us): best: 111 median: 604 worst: 640
template<typename T> inline __device__ T floord(T n, T d) {
return n < 0 ? - (-n + d - 1)/d : n / d;
}
// Halide type handling
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;
extern "C" {
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
__shared__ float32 _output_0[3][20][5][11];
__syncthreads();
for (int c3 = 0; c3 <= min(2, -3 * b0 + 3); c3 += 1) {
for (int c4 = 0; c4 <= min(19, -20 * b1 + 31); c4 += 1) {
for (int c5 = 0; c5 <= 4; c5 += 1) {
_output_0[c3][c4][c5][t0] = output[3*b0 + c3][20*b1 + c4][5*b2 + c5][t0];
}
}
}
__syncthreads();
for (int c4 = 3 * b0; c4 <= min(3, 3 * b0 + 2); c4 += 1) {
for (int c5 = 20 * b1; c5 <= min(31, 20 * b1 + 19); c5 += 1) {
for (int c6 = 5 * b2; c6 <= 5 * b2 + 4; c6 += 1) {
for (int c7 = 0; c7 <= 9; c7 += 1) {
for (int c8 = 0; c8 <= 3; c8 += 1) {
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
_output_0[-3*b0 + c4][-20*b1 + c5][-5*b2 + c6][c7] = (_output_0[-3*b0 + c4][-20*b1 + c5][-5*b2 + c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]);
}
}
}
}
}
}
__syncthreads();
for (int c3 = 0; c3 <= min(2, -3 * b0 + 3); c3 += 1) {
for (int c4 = 0; c4 <= min(19, -20 * b1 + 31); c4 += 1) {
for (int c5 = 0; c5 <= 4; c5 += 1) {
output[3*b0 + c3][20*b1 + c4][5*b2 + c5][t0] = _output_0[c3][c4][c5][t0];
}
}
}
__syncthreads();
}
}
 Generation 3 Job[Compiled, GPU] (8, 7)/10 Time (us): best: 111 median: 604 worst: 640
template<typename T> inline __device__ T floord(T n, T d) {
return n < 0 ? - (-n + d - 1)/d : n / d;
}
// Halide type handling
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;
extern "C" {
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
for (int c4 = 3 * b0; c4 <= min(3, 3 * b0 + 2); c4 += 1) {
for (int c5 = 20 * b1; c5 <= min(31, 20 * b1 + 19); c5 += 1) {
for (int c6 = 5 * b2; c6 <= 5 * b2 + 4; c6 += 1) {
for (int c7 = 0; c7 <= 9; c7 += 1) {
for (int c8 = 0; c8 <= 3; c8 += 1) {
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
output[c4][c5][c6][c7] = (output[c4][c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]);
}
}
}
}
}
}
}
}
 Generation 3 Job[Compiled, GPU] (9, 8)/10 Time (us): best: 111 median: 558 worst: 640
template<typename T> inline __device__ T floord(T n, T d) {
return n < 0 ? - (-n + d - 1)/d : n / d;
}
// Halide type handling
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;
extern "C" {
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
for (int c1 = 8 * b1; c1 <= 31; c1 += 16) {
for (int c2 = 0; c2 <= 9; c2 += 4) {
for (int c4 = 0; c4 <= 3; c4 += 1) {
for (int c5 = 0; c5 <= 7; c5 += 1) {
for (int c6 = 0; c6 <= min(3, -c2 + 9); c6 += 1) {
for (int c7 = 0; c7 <= 9; c7 += 1) {
for (int c8 = 0; c8 <= 3; c8 += 1) {
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
output[c4][c1 + c5][c2 + c6][c7] = (output[c4][c1 + c5][c2 + c6][c7] + input[c4][c1 + c5][t0 + 2*c2 + 2*c6][-2*t0 + 2*c7 + c8]);
}
__syncthreads();
}
}
}
}
}
}
}
}
}
 Generation 3 Job[Compiled, GPU] (10, 9)/10 Time (us): best: 111 median: 558 worst: 640
template<typename T> inline __device__ T floord(T n, T d) {
return n < 0 ? - (-n + d - 1)/d : n / d;
}
// Halide type handling
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;
extern "C" {
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) {
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput);
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput);
for (int c2 = 0; c2 <= 9; c2 += 5) {
for (int c4 = 0; c4 <= 3; c4 += 1) {
for (int c5 = 20 * b1; c5 <= min(31, 20 * b1 + 19); c5 += 1) {
for (int c6 = c2; c6 <= c2 + 4; c6 += 1) {
for (int c7 = 0; c7 <= 9; c7 += 1) {
for (int c8 = 0; c8 <= 3; c8 += 1) {
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) {
output[c4][c5][c6][c7] = (output[c4][c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]);
}
__syncthreads();
}
}
}
}
}
}
}
}
 Generation 3 Job[Compiled, GPU] (10, 10)/10 Time (us): best: 111 median: 558 worst: 640 Generation 3 Job[Compiled, GPU] (10, 10)/10 Time (us): best: 111 median: 558 worst: 640
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment