Skip to content

Instantly share code, notes, and snippets.

@WOnder93
Last active December 15, 2016 10:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save WOnder93/fed4929017e48f1f2aed0fda228e7bf8 to your computer and use it in GitHub Desktop.
PB197 - project solution
/*
* Copyright (C) 2016, Ondrej Mosnacek <omosnacek@gmail.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
/*
* Informational benchmarks:
* 1024 x 1024: 26240 ME/s
* 2048 x 2048: 33520 ME/s
* 4096 x 4096: 36010 ME/s
* 8192 x 8192: 36790 ME/s
* 8192 x 1024: 35140 ME/s
* 1024 x 8192: 35150 ME/s
* 16384 x 256: 32920 ME/s
* 256 x 16384: 32950 ME/s
* 262144 x 32: 29340 ME/s
* 32 x 262144: 28790 ME/s
*/
/* warp width fixed by the hardware; the shuffle-based reductions below
 * depend on this exact value: */
#define WARP_SIZE 32 /* do not change! ;) */
/* begin tuned parameters */
#define VEC_SIZE 4 /* vector size for global memory reading; one of {1,2,4} */
#define WARP_ROWS 1 /* number of warps per block (in the Y axis) */
#define WARP_COLS 4 /* number of warps per block (in the X axis) */
#define TOAVG_BLOCK_SIZE 4 /* block size (in warps) for the toAverage kernel */
/* end tuned parameters */
/* rows/columns per block: */
#define BLOCK_ROWS (WARP_SIZE * WARP_ROWS)
#define BLOCK_COLS (WARP_SIZE * WARP_COLS)
/* how many vec_type vectors make up WARP_SIZE ints: */
#define VECS_PER_WARP (WARP_SIZE / VEC_SIZE)
/* Pairwise value exchange between warp lanes whose IDs differ in the
 * bit(s) selected by 'mask' (the shuffle lane distance is mask * mul).
 * Lanes with the mask bit clear trade away *s1; lanes with the mask bit
 * set trade away *s0 -- the received value overwrites the slot that was
 * sent. NOTE(review): uses the legacy mask-less __shfl_xor, which relies
 * on pre-Volta implicit warp synchrony; SM70+ would need __shfl_xor_sync. */
__device__ void warp_exchange(int id, int mask, int mul, int *s0, int *s1)
{
    int *slot = ((id & mask) == 0) ? s1 : s0;
    int outgoing = *slot;
    *slot = __shfl_xor(outgoing, mask * mul, WARP_SIZE);
}
/* vector-size-specific definitions:
 * vec_type is the unit of global memory loads (int1/int2/int4); the
 * helpers below give it a uniform interface regardless of VEC_SIZE: */
#if VEC_SIZE == 1
typedef int1 vec_type;
#define VEC_ZERO make_int1(0)
/* component-wise *a += *b */
__device__ void vec_add(vec_type *a, const vec_type *b)
{
a->x += b->x;
}
/* horizontal sum of all components of *v */
__device__ int vec_sum(const vec_type *v)
{
return v->x;
}
/* reduce the per-component column sums down to one scalar per thread;
 * trivial for a 1-component vector (tir is unused here) */
__device__ int vec_shuffle(int tir, vec_type *v)
{
return v->x;
}
#elif VEC_SIZE == 2
typedef int2 vec_type;
#define VEC_ZERO make_int2(0, 0)
/* component-wise *a += *b */
__device__ void vec_add(vec_type *a, const vec_type *b)
{
a->x += b->x;
a->y += b->y;
}
/* horizontal sum of all components of *v */
__device__ int vec_sum(const vec_type *v)
{
return v->x + v->y;
}
/* exchange one component with the partner lane (XOR of bit 0 of tir;
 * shuffle distance VECS_PER_WARP) and fold both components into a single
 * scalar, so each thread ends up holding one column's partial sum */
__device__ int vec_shuffle(int tir, vec_type *v)
{
warp_exchange(tir, 1, VECS_PER_WARP, &v->x, &v->y);
return v->x + v->y;
}
#elif VEC_SIZE == 4
typedef int4 vec_type;
#define VEC_ZERO make_int4(0, 0, 0, 0)
/* component-wise *a += *b */
__device__ void vec_add(vec_type *a, const vec_type *b)
{
a->x += b->x;
a->y += b->y;
a->z += b->z;
a->w += b->w;
}
/* horizontal sum of all components of *v */
__device__ int vec_sum(const vec_type *v)
{
return v->x + v->y + v->z + v->w;
}
/* two-level exchange tree over the low bits of tir: lane pairs swap the
 * x/y and z/w halves, then lane quads swap the combined x/z partials, so
 * the returned scalar per thread corresponds to a single vector component
 * (NOTE(review): this should line up with the base_c + tic * VEC_SIZE +
 * tir indexing used in sumTable's atomicAdd -- confirm if either the
 * layout or VEC_SIZE changes) */
__device__ int vec_shuffle(int tir, vec_type *v)
{
warp_exchange(tir, 1, VECS_PER_WARP, &v->x, &v->y);
v->x += v->y;
warp_exchange(tir, 1, VECS_PER_WARP, &v->z, &v->w);
v->z += v->w;
warp_exchange(tir, 2, VECS_PER_WARP, &v->x, &v->z);
return v->x + v->z;
}
#else
#error "Invalid vector size!"
#endif
/*
 * Kernel: accumulates per-row and per-column sums of a rows x cols int
 * table into sum_rows / sum_cols (via atomicAdd, so the outputs must be
 * zeroed beforehand -- see solveGPU).
 *
 * Launch shape (from solveGPU): dim3(WARP_SIZE, WARP_COLS, WARP_ROWS)
 * threads per block; each warp covers one WARP_SIZE x WARP_SIZE tile.
 * Rows are read as vec_type vectors; partial row sums are combined across
 * lanes with a manually scheduled pairwise-reduction tree (LD2/ADD below).
 *
 * NOTE(review): there is no per-element bounds check, so rows and cols
 * appear to be assumed multiples of WARP_SIZE -- confirm against the
 * caller's input guarantees.
 * NOTE(review): relies on pre-Volta implicit warp synchrony (legacy
 * __shfl_xor inside warp_exchange); *_sync variants needed on SM70+.
 */
__global__ void sumTable(const vec_type *table,
int *sum_rows, int *sum_cols,
const int rows, const int cols)
{
/* block coordinates within the grid: */
int br = blockIdx.y;
int bc = blockIdx.x;
/* lane index within the warp: */
int ti = threadIdx.x;
/* warp coordinates within the block (fixed to 0 when the block has a
 * single warp in that axis): */
#if WARP_COLS > 1
int tc = threadIdx.y;
#else
int tc = 0;
#endif
#if WARP_ROWS > 1
int tr = threadIdx.z;
#else
int tr = 0;
#endif
/* split the lane index: tic selects which vector within a row this
 * lane loads, tir selects the row offset within a VEC_SIZE-row group: */
int tic = ti % VECS_PER_WARP;
int tir = ti / VECS_PER_WARP;
/* first column/row of this warp's tile: */
int base_c = bc * BLOCK_COLS + tc * WARP_SIZE;
int base_r = br * BLOCK_ROWS + tr * WARP_SIZE;
/* warps whose whole tile falls outside the table bail out early (only
 * possible when a block spans multiple warps in the given axis): */
#if WARP_COLS > 1
if (base_c >= cols) {
return;
}
#endif
#if WARP_ROWS > 1
if (base_r >= rows) {
return;
}
#endif
/* a stack buffer for accumulating partial row sums on the fly:
 * (it should get mapped to about log2(VECS_PER_WARP) registers
 * by the compiler) */
int stack[VECS_PER_WARP];
int stack_pos = 0;
/* per-thread partial column sums, one component per column touched by
 * this thread's vector loads: */
vec_type col_sums = VEC_ZERO;
/* load two values from global memory and push them onto the stack: */
#define LD2(i) \
do { \
vec_type v; \
int pos_r = base_r + (i) * 2 * VEC_SIZE + tir; \
int pos_c = base_c / VEC_SIZE + tic; \
v = table[(pos_r + 0 * VEC_SIZE) * (cols / VEC_SIZE) + pos_c]; \
stack[stack_pos++] = vec_sum(&v); \
vec_add(&col_sums, &v); \
v = table[(pos_r + 1 * VEC_SIZE) * (cols / VEC_SIZE) + pos_c]; \
stack[stack_pos++] = vec_sum(&v); \
vec_add(&col_sums, &v); \
} while (0)
/* pop two values from stack, exchange one of them with another thread,
 * add them, and push the result onto the stack: */
#define ADD(i) \
do { \
int v1 = stack[--stack_pos]; \
int v0 = stack[--stack_pos]; \
warp_exchange(tic, 1 << (i), 1, &v0, &v1); \
stack[stack_pos++] = v0 + v1; \
} while(0)
/* load the values, re-distribute and accumulate partial row sums,
 * and accumulate partial column sums:
 * (this is a manually unrolled pairwise reduction tree; the block-level
 * fences limit how far the compiler may reorder the global loads) */
#if VEC_SIZE == 1
LD2( 0); ADD(0);
LD2( 1); ADD(0); ADD(1);
__threadfence_block();
LD2( 2); ADD(0);
LD2( 3); ADD(0); ADD(1); ADD(2);
__threadfence_block();
LD2( 4); ADD(0);
LD2( 5); ADD(0); ADD(1);
__threadfence_block();
LD2( 6); ADD(0);
LD2( 7); ADD(0); ADD(1); ADD(2); ADD(3);
__threadfence_block();
LD2( 8); ADD(0);
LD2( 9); ADD(0); ADD(1);
__threadfence_block();
LD2(10); ADD(0);
LD2(11); ADD(0); ADD(1); ADD(2);
__threadfence_block();
LD2(12); ADD(0);
LD2(13); ADD(0); ADD(1);
__threadfence_block();
LD2(14); ADD(0);
LD2(15); ADD(0); ADD(1); ADD(2); ADD(3); ADD(4);
#elif VEC_SIZE == 2
LD2( 0); ADD(0);
LD2( 1); ADD(0); ADD(1);
__threadfence_block();
LD2( 2); ADD(0);
LD2( 3); ADD(0); ADD(1); ADD(2);
__threadfence_block();
LD2( 4); ADD(0);
LD2( 5); ADD(0); ADD(1);
__threadfence_block();
LD2( 6); ADD(0);
LD2( 7); ADD(0); ADD(1); ADD(2); ADD(3);
#elif VEC_SIZE == 4
LD2( 0); ADD(0);
LD2( 1); ADD(0); ADD(1);
__threadfence_block();
LD2( 2); ADD(0);
LD2( 3); ADD(0); ADD(1); ADD(2);
#else
#error "Invalid vector size!"
#endif
/* The above mess is equivalent to the following code, but
 * unfortunately NVCC fails to unroll the inner loop...
 * NOTE: The bounds for 'k' are the ruler function [1] of (i + 1).
 * [1] https://oeis.org/A001511 */
/*
#pragma unroll
for (int i = 0; i < VECS_PER_WARP / 2; i++) {
LD2(i);
#pragma unroll
for (int k = 0; (1 << k) <= ((i + 1) & -(i + 1)); k++) {
ADD(k);
}
if (i % 2 == 1) {
__threadfence_block();
}
}
*/
#undef LD2
#undef ADD
/* re-distribute and accumulate the partial column sums: */
int col_sum = vec_shuffle(tir, &col_sums);
/* we already have our row sum on the stack: */
int row_sum = stack[0];
/* update the global average corresp. to our chunk's row/column sum:
 * (after the reductions, thread (tic, tir) holds the totals for row and
 * column index base + tic * VEC_SIZE + tir of its warp's tile) */
atomicAdd(&sum_rows[base_r + tic * VEC_SIZE + tir], row_sum);
atomicAdd(&sum_cols[base_c + tic * VEC_SIZE + tir], col_sum);
}
#define TOAVG_BLOCK_ITEMS (TOAVG_BLOCK_SIZE * WARP_SIZE)
/* Converts two integer-sum arrays into float averages in one launch:
 * threads whose global index falls below limit1 service avgs1/sums1, and
 * the remaining threads service avgs2/sums2 (indices shifted by limit1).
 * mul1/mul2 are the precomputed reciprocals of the respective divisors.
 * Out-of-range trailing threads simply do nothing. */
__global__ void toAverage2(float *avgs1, const int *sums1,
                           const float mul1, const int limit1,
                           float *avgs2, const int *sums2,
                           const float mul2, const int limit2)
{
    int idx = blockIdx.x * TOAVG_BLOCK_ITEMS + threadIdx.x;
    if (idx < limit1) {
        avgs1[idx] = (float)sums1[idx] * mul1;
        return;
    }
    idx -= limit1;
    if (idx < limit2) {
        avgs2[idx] = (float)sums2[idx] * mul2;
    }
}
/* ceil-division: number of blocks needed to cover 'size' items: */
#define BLOCK_COUNT(size, block) (((size) + (block) - 1) / (block))
/*
 * Host entry point: computes per-student and per-question averages of the
 * students x questions 'results' table (all pointers are device memory).
 *
 * The float output buffers are reused as int accumulators by sumTable and
 * converted to averages in place by toAverage2 -- this relies on
 * sizeof(int) == sizeof(float).
 *
 * NOTE(review): no CUDA error checking on the memsets or kernel launches
 * (no cudaGetLastError/cudaDeviceSynchronize); failures here are silent.
 * NOTE(review): sumTable has no per-element bounds checks, so students
 * and questions appear to be assumed multiples of WARP_SIZE -- confirm.
 */
void solveGPU(
const int *results,
float *avg_stud, float *avg_que,
const int students, const int questions)
{
/* compute the total number of blocks in X/Y axis: */
int blocksY = BLOCK_COUNT(students, BLOCK_ROWS);
int blocksX = BLOCK_COUNT(questions, BLOCK_COLS);
/* clear result arrays, so threads can just add to them: */
cudaMemset(avg_stud, 0, students*sizeof(avg_stud[0]));
cudaMemset(avg_que, 0, questions*sizeof(avg_que[0]));
/* compute column & row sums: */
sumTable<<<dim3(blocksX, blocksY), dim3(WARP_SIZE, WARP_COLS, WARP_ROWS)>>>(
(const vec_type *)results, (int *)avg_stud, (int *)avg_que,
students, questions);
/* convert sums to averages: */
int blocks = BLOCK_COUNT(students + questions, TOAVG_BLOCK_ITEMS);
toAverage2<<<blocks, TOAVG_BLOCK_ITEMS>>>(
avg_stud, (int *)avg_stud, 1.0F / questions, students,
avg_que, (int *)avg_que, 1.0F / students, questions);
}
/*
* Copyright (C) 2016, Ondrej Mosnacek <omosnacek@gmail.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
/*
* Informational benchmarks:
* 1024 x 1024: 26120 ME/s
* 2048 x 2048: 33480 ME/s
* 4096 x 4096: 35940 ME/s
* 8192 x 8192: 36780 ME/s
* 8192 x 1024: 35110 ME/s
* 1024 x 8192: 35120 ME/s
* 16384 x 256: 32820 ME/s
* 256 x 16384: 32910 ME/s
* 262144 x 32: 29430 ME/s
* 32 x 262144: 29090 ME/s
*/
/* warp width fixed by the hardware; the shuffle-based reductions below
 * depend on this exact value: */
#define WARP_SIZE 32 /* do not change! ;) */
/* begin tuned parameters */
#define VEC_SIZE 4 /* vector size for global memory reading; one of {1,2,4} */
#define WARP_ROWS 2 /* number of warps per block (in the Y axis) */
#define WARP_COLS 2 /* number of warps per block (in the X axis) */
#define FENCE_INTERVAL 2 /* how often to do a memory fence */
#define TOAVG_BLOCK_SIZE 4 /* block size (in warps) for the toAverage kernel */
/* end tuned parameters */
/* rows/columns per block: */
#define BLOCK_ROWS (WARP_SIZE * WARP_ROWS)
#define BLOCK_COLS (WARP_SIZE * WARP_COLS)
/* how many vec_type vectors make up WARP_SIZE ints: */
#define VECS_PER_WARP (WARP_SIZE / VEC_SIZE)
/* Pairwise value exchange between warp lanes whose IDs differ in the
 * bit(s) selected by 'mask' (shuffle lane distance is mask * mul).
 * Lanes with the mask bit set send and replace *s0; the other lanes send
 * and replace *s1. NOTE(review): legacy mask-less __shfl_xor -- depends
 * on pre-Volta implicit warp synchrony. */
__device__ void warp_exchange(int id, int mask, int mul, int *s0, int *s1)
{
    bool keep_first = (id & mask) != 0;
    int payload = keep_first ? *s0 : *s1;
    payload = __shfl_xor(payload, mask * mul, WARP_SIZE);
    if (keep_first)
        *s0 = payload;
    else
        *s1 = payload;
}
/* vector-size-specific definitions:
 * vec_type is the unit of global memory loads (int1/int2/int4); the
 * helpers below give it a uniform interface regardless of VEC_SIZE: */
#if VEC_SIZE == 1
typedef int1 vec_type;
#define VEC_ZERO make_int1(0)
/* component-wise *a += *b */
__device__ void vec_add(vec_type *a, const vec_type *b)
{
a->x += b->x;
}
/* horizontal sum of all components of *v */
__device__ int vec_sum(const vec_type *v)
{
return v->x;
}
/* reduce the per-component column sums down to one scalar per thread;
 * trivial for a 1-component vector (tir is unused here) */
__device__ int vec_shuffle(int tir, vec_type *v)
{
return v->x;
}
#elif VEC_SIZE == 2
typedef int2 vec_type;
#define VEC_ZERO make_int2(0, 0)
/* component-wise *a += *b */
__device__ void vec_add(vec_type *a, const vec_type *b)
{
a->x += b->x;
a->y += b->y;
}
/* horizontal sum of all components of *v */
__device__ int vec_sum(const vec_type *v)
{
return v->x + v->y;
}
/* exchange one component with the partner lane (XOR of bit 0 of tir;
 * shuffle distance VECS_PER_WARP) and fold both components into a single
 * scalar, so each thread ends up holding one column's partial sum */
__device__ int vec_shuffle(int tir, vec_type *v)
{
warp_exchange(tir, 1, VECS_PER_WARP, &v->x, &v->y);
return v->x + v->y;
}
#elif VEC_SIZE == 4
typedef int4 vec_type;
#define VEC_ZERO make_int4(0, 0, 0, 0)
/* component-wise *a += *b */
__device__ void vec_add(vec_type *a, const vec_type *b)
{
a->x += b->x;
a->y += b->y;
a->z += b->z;
a->w += b->w;
}
/* horizontal sum of all components of *v */
__device__ int vec_sum(const vec_type *v)
{
return v->x + v->y + v->z + v->w;
}
/* two-level exchange tree over the low bits of tir: lane pairs swap the
 * x/y and z/w halves, then lane quads swap the combined x/z partials, so
 * the returned scalar per thread corresponds to a single vector component
 * (NOTE(review): this should line up with the base_c + tic * VEC_SIZE +
 * tir indexing used in sumTable's atomicAdd -- confirm if either the
 * layout or VEC_SIZE changes) */
__device__ int vec_shuffle(int tir, vec_type *v)
{
warp_exchange(tir, 1, VECS_PER_WARP, &v->x, &v->y);
v->x += v->y;
warp_exchange(tir, 1, VECS_PER_WARP, &v->z, &v->w);
v->z += v->w;
warp_exchange(tir, 2, VECS_PER_WARP, &v->x, &v->z);
return v->x + v->z;
}
#else
#error "Invalid vector size!"
#endif
/*
 * Kernel (loop variant): accumulates per-row and per-column sums of a
 * rows x cols int table into sum_rows / sum_cols (via atomicAdd, so the
 * outputs must be zeroed beforehand -- see solveGPU).
 *
 * Launch shape (from solveGPU): dim3(WARP_SIZE, WARP_COLS, WARP_ROWS)
 * threads per block; each warp covers one WARP_SIZE x WARP_SIZE tile and
 * iterates over its rows, reducing each row with a warp-shuffle butterfly.
 *
 * NOTE(review): there is no per-element bounds check, so rows and cols
 * appear to be assumed multiples of WARP_SIZE -- confirm against the
 * caller's input guarantees.
 * NOTE(review): uses legacy mask-less __shfl_xor (directly and via
 * warp_exchange); *_sync variants needed on SM70+.
 */
__global__ void sumTable(const vec_type *table,
int *sum_rows, int *sum_cols,
const int rows, const int cols)
{
/* block coordinates within the grid: */
int br = blockIdx.y;
int bc = blockIdx.x;
/* lane index within the warp and warp coordinates within the block: */
int ti = threadIdx.x;
int tc = threadIdx.y;
int tr = threadIdx.z;
/* split the lane index: tic selects which vector within a row this
 * lane loads, tir selects the row offset within a VEC_SIZE-row group: */
int tic = ti % VECS_PER_WARP;
int tir = ti / VECS_PER_WARP;
/* first column/row of this warp's tile: */
int base_c = bc * BLOCK_COLS + tc * WARP_SIZE;
int base_r = br * BLOCK_ROWS + tr * WARP_SIZE;
/* warps whose whole tile falls outside the table bail out early (only
 * possible when a block spans multiple warps in the given axis): */
#if WARP_COLS > 1
if (base_c >= cols) {
return;
}
#endif
#if WARP_ROWS > 1
if (base_r >= rows) {
return;
}
#endif
/* per-thread partial column sums and this thread's final row sum: */
vec_type col_sums = VEC_ZERO;
int row_sum = 0;
#pragma unroll
for (int i = 0; i < VECS_PER_WARP; i++) {
/* load next row: */
int pos_r = base_r + i * VEC_SIZE + tir;
int pos_c = base_c / VEC_SIZE + tic;
vec_type item = table[pos_r * (cols / VEC_SIZE) + pos_c];
/* accumulate row sum using the butterfly reduction
 * (stolen from the CUDA Programming Guide); the XOR distances
 * 1..VECS_PER_WARP/2 reduce across the tic sub-group only: */
int value = vec_sum(&item);
#pragma unroll
for (int k = 1; k < VECS_PER_WARP; k *= 2) {
value += __shfl_xor(value, k, WARP_SIZE);
}
/* lane tic == i keeps the completed sum for row base_r+i*VEC_SIZE+tir
 * (matches the sum_rows indexing at the bottom): */
if (i == tic) {
row_sum = value;
}
/* increment partial column sums: */
vec_add(&col_sums, &item);
/* perform a block-level memory fence to limit reordering
 * of global loads: */
if (i % FENCE_INTERVAL == (FENCE_INTERVAL - 1)) {
__threadfence_block();
}
}
/* re-distribute and accumulate the partial column sums: */
int col_sum = vec_shuffle(tir, &col_sums);
/* update the global average corresp. to our chunk's row/column sum: */
atomicAdd(&sum_rows[base_r + tic * VEC_SIZE + tir], row_sum);
atomicAdd(&sum_cols[base_c + tic * VEC_SIZE + tir], col_sum);
}
#define TOAVG_BLOCK_ITEMS (TOAVG_BLOCK_SIZE * WARP_SIZE)
/* One launch that turns both integer-sum arrays into float averages.
 * The flat thread index is split at limit1: the first limit1 threads
 * handle avgs1/sums1 and the rest handle avgs2/sums2 (after subtracting
 * the offset). mul1/mul2 are precomputed 1/divisor factors; any surplus
 * threads at the tail fall through without touching memory. */
__global__ void toAverage2(float *avgs1, const int *sums1,
                           const float mul1, const int limit1,
                           float *avgs2, const int *sums2,
                           const float mul2, const int limit2)
{
    int gid = blockIdx.x * TOAVG_BLOCK_ITEMS + threadIdx.x;
    if (gid < limit1) {
        avgs1[gid] = (float)sums1[gid] * mul1;
    } else {
        int off = gid - limit1;
        if (off < limit2) {
            avgs2[off] = (float)sums2[off] * mul2;
        }
    }
}
/* ceil-division: number of blocks needed to cover 'size' items: */
#define BLOCK_COUNT(size, block) (((size) + (block) - 1) / (block))
/*
 * Host entry point: computes per-student and per-question averages of the
 * students x questions 'results' table (all pointers are device memory).
 * The float output buffers double as int accumulators for sumTable and
 * are then converted to averages in place by toAverage2, which relies on
 * sizeof(int) == sizeof(float).
 */
void solveGPU(
    const int *results,
    float *avg_stud, float *avg_que,
    const int students, const int questions)
{
    /* grid dimensions (ceil-div so partial tiles still get a block): */
    const int gridRows = BLOCK_COUNT(students, BLOCK_ROWS);
    const int gridCols = BLOCK_COUNT(questions, BLOCK_COLS);
    /* zero the outputs so the kernel can accumulate with atomicAdd: */
    cudaMemset(avg_stud, 0, students * sizeof(avg_stud[0]));
    cudaMemset(avg_que, 0, questions * sizeof(avg_que[0]));
    /* accumulate row (student) and column (question) sums: */
    sumTable<<<dim3(gridCols, gridRows), dim3(WARP_SIZE, WARP_COLS, WARP_ROWS)>>>(
        (const vec_type *)results, (int *)avg_stud, (int *)avg_que,
        students, questions);
    /* divide both sum arrays by their respective counts in one launch: */
    const int avgBlocks = BLOCK_COUNT(students + questions, TOAVG_BLOCK_ITEMS);
    toAverage2<<<avgBlocks, TOAVG_BLOCK_ITEMS>>>(
        avg_stud, (int *)avg_stud, 1.0F / questions, students,
        avg_que, (int *)avg_que, 1.0F / students, questions);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment