Skip to content

Instantly share code, notes, and snippets.

@louchenyao
Last active July 31, 2020 01:49
Show Gist options
  • Save louchenyao/cde014599eba100abf67981ba0ade7e3 to your computer and use it in GitHub Desktop.
Benchmark of GPU sequential read performance
// nvcc bench_seq.cu -O3 -gencode arch=compute_70,code=sm_70
// Result on V100:
// bench_seq max throughput: 769.856 GiB/s
// bench_seq_unroll max throughput: 833.067 GiB/s
#include <iostream>
// Tree reduction of per-thread partial sums through shared memory.
// Each thread deposits its value at buf[threadIdx.x]; after the halving
// loop, buf[0] holds the block-wide total. Only thread 0 receives that
// total as the return value — every other thread gets 0.
// Assumes blockDim.x is a power of two (halving drops odd remainders).
// A warp-level pre-reduction would be more efficient, but this step is
// not the benchmark bottleneck.
__device__
int64_t reduce(int64_t *buf, uint64_t s) {
    buf[threadIdx.x] = s;
    __syncthreads();
    for (int active = blockDim.x >> 1; active > 0; active >>= 1) {
        if (threadIdx.x < active) {
            buf[threadIdx.x] += buf[threadIdx.x + active];
        }
        __syncthreads();
    }
    return (threadIdx.x == 0) ? buf[0] : 0;
}
// Baseline sequential-read benchmark.
// The input is split into gridDim.x contiguous chunks; each block scans
// its chunk with coalesced loads strided by blockDim.x, keeping a
// per-thread partial sum. The block total is reduced in shared memory
// and folded into *res with one atomic per block.
// Assumes n divides evenly by gridDim.x, each chunk by blockDim.x, and
// blockDim.x == 512 (the shared buffer size).
__global__
void bench_seq(int *a, int n, int64_t *res) {
    const int chunk = n / gridDim.x;
    const int begin = chunk * blockIdx.x;
    const int end = begin + chunk;
    unsigned long long partial = 0;
    for (int base = begin; base < end; base += blockDim.x) {
        partial += a[base + threadIdx.x];
    }
    __shared__ int64_t buf[512];
    unsigned long long total = reduce(buf, partial);
    if (threadIdx.x == 0) {
        atomicAdd((unsigned long long*)res, total);
    }
}
// Unrolled variant of bench_seq: each iteration issues four independent
// global loads into registers before accumulating, giving the memory
// system more loads in flight per thread.
// Assumes each block's chunk divides evenly by 4 * blockDim.x, and
// blockDim.x == 512 (the shared buffer size).
__global__
void bench_seq_unroll(int *a, int n, int64_t *res) {
    const int chunk = n / gridDim.x;
    const int begin = chunk * blockIdx.x;
    const int end = begin + chunk;
    int64_t partial = 0;
    for (int base = begin; base < end; base += 4 * blockDim.x) {
        int v[4];
        #pragma unroll
        for (int k = 0; k < 4; k++) {
            v[k] = a[base + k * blockDim.x + threadIdx.x];
        }
        #pragma unroll
        for (int k = 0; k < 4; k++) {
            partial += v[k];
        }
    }
    __shared__ int64_t buf[512];
    unsigned long long total = reduce(buf, partial);
    if (threadIdx.x == 0) {
        atomicAdd((unsigned long long*)res, total);
    }
}
// Times one launch of kernel `f`, validates its reduced sum, prints the
// stats, and keeps the best throughput (GiB/s) seen so far in `m`.
// Expands inside main(); relies on enclosing-scope locals:
//   blocks (grid size), d_a / n (device input), d_res (device result),
//   ans (expected sum). `return 1` aborts main() on any failure.
// Fixes vs. original: both cudaEvent_t objects were leaked on every
// expansion (44 launches per run); launch errors were never checked, so
// a bad configuration would silently report garbage timings.
#define BENCH(f, m) { \
    float t; \
    cudaEvent_t start, stop; \
    cudaEventCreate(&start); \
    cudaEventCreate(&stop); \
    cudaMemset(d_res, 0, sizeof(int64_t)); \
    cudaEventRecord(start, 0); \
    f<<<blocks, 512>>>(d_a, n, d_res); \
    cudaError_t launch_err = cudaGetLastError(); \
    if (launch_err != cudaSuccess) { \
        std::cout << "Launch failed: " << cudaGetErrorString(launch_err) << std::endl; \
        return 1; \
    } \
    cudaEventRecord(stop, 0); \
    cudaEventSynchronize(stop); \
    cudaEventElapsedTime(&t, start, stop); \
    cudaEventDestroy(start); \
    cudaEventDestroy(stop); \
    int64_t h_res; \
    cudaMemcpy(&h_res, d_res, sizeof(h_res), cudaMemcpyDeviceToHost); \
    if (h_res != ans) { \
        std::cout << "Wrong Result: " << h_res << std::endl; \
        std::cout << "Expected: " << ans << std::endl; \
        return 1; \
    } \
    double thr = double(4)*n/(1<<30)*1000/t; \
    if (thr > m) { \
        m = thr; \
    } \
    std::cout << "------------" << std::endl \
        << "| 📛: " << #f << std::endl \
        << "|Blocks: " << blocks \
        << "\tTime: " << t \
        << "\tThroughput: " << thr \
        << std::endl; \
}
// Host driver: fills a 4 GiB host buffer with 0..n-1 (tracking the exact
// sum as a CPU reference), uploads it, then sweeps grid sizes 8..8192
// running each benchmark kernel twice per size, and reports the best
// throughput observed for each variant. Returns 1 on any failure.
int main(int argc, char** argv)
{
    const int n = 1024 * (1 << 20); // 2^30 ints == 4 GiB of data
    // generating
    int *h_a = new int[n];
    int *d_a = nullptr;
    int64_t *d_res = nullptr;
    int64_t ans = 0; // CPU reference sum used by BENCH to validate results
    for (int i = 0; i < n; i++) {
        h_a[i] = i;
        ans += h_a[i];
    }
    // The 4 GiB device buffer can exceed available memory on smaller GPUs;
    // an unchecked failure here would only surface later as a confusing
    // memcpy/kernel error, so check both allocations up front.
    if (cudaMalloc((void**)&d_a, sizeof(int) * n) != cudaSuccess ||
        cudaMalloc((void**)&d_res, sizeof(int64_t)) != cudaSuccess) {
        std::cout << "cudaMalloc failed: "
                  << cudaGetErrorString(cudaGetLastError()) << std::endl;
        delete[] h_a;
        return 1;
    }
    cudaMemcpy(d_a, h_a, sizeof(int) * n, cudaMemcpyHostToDevice);
    // run
    double seq_max = 0, seq_unroll_max = 0;
    for (int blocks = 8; blocks <= 8192; blocks *= 2) {
        // two runs per configuration so the best-of filter discards warm-up
        for (int i = 0; i < 2; i++) {
            BENCH(bench_seq, seq_max);
            BENCH(bench_seq_unroll, seq_unroll_max);
        }
    }
    std::cout << "------------" << std::endl
        << "| bench_seq max throughput: " << seq_max << std::endl
        << "| bench_seq_unroll max throughput: " << seq_unroll_max << std::endl;
    cudaFree(d_a);
    cudaFree(d_res);
    delete[] h_a;
    return 0;
}
// nvcc -ptx -o bench_seq.ptx bench_seq.cu -O3 -gencode arch=compute_70,code=sm_70
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-27506705
// Cuda compilation tools, release 10.2, V10.2.89
// Based on LLVM 3.4svn
//
.version 6.5
.target sm_70
.address_size 64
// .globl _Z9bench_seqPiiPl
// _ZZ9bench_seqPiiPlE3buf has been demoted
// _ZZ16bench_seq_unrollPiiPlE3buf has been demoted
// Generated PTX for bench_seq(int* a, int n, int64_t* res);
// mangled name _Z9bench_seqPiiPl. The reduce() helper is fully inlined.
.visible .entry _Z9bench_seqPiiPl(
.param .u64 _Z9bench_seqPiiPl_param_0,
.param .u32 _Z9bench_seqPiiPl_param_1,
.param .u64 _Z9bench_seqPiiPl_param_2
)
{
.reg .pred %p<8>;
.reg .b32 %r<27>;
.reg .b64 %rd<21>;
// demoted variable: the kernel's __shared__ int64_t buf[512] (512 * 8 bytes)
.shared .align 8 .b8 _ZZ9bench_seqPiiPlE3buf[4096];
ld.param.u64 %rd8, [_Z9bench_seqPiiPl_param_0]; // a
ld.param.u32 %r11, [_Z9bench_seqPiiPl_param_1]; // n
ld.param.u64 %rd5, [_Z9bench_seqPiiPl_param_2]; // res
cvta.to.global.u64 %rd1, %rd8;
mov.u32 %r12, %nctaid.x;
div.u32 %r13, %r11, %r12; // items_per_block = n / gridDim.x
mov.u32 %r14, %ctaid.x;
mul.lo.s32 %r25, %r14, %r13; // loop start = items_per_block * blockIdx.x
add.s32 %r15, %r14, 1;
mul.lo.s32 %r2, %r15, %r13; // loop end = items_per_block * (blockIdx.x+1)
mov.u32 %r3, %tid.x;
mov.u64 %rd20, 0; // s = 0
mov.u32 %r4, %ntid.x; // blockDim.x
setp.ge.u32 %p2, %r25, %r2;
@%p2 bra BB0_2; // skip the loop when the block's range is empty
// BB0_1 is the for loop
BB0_1:
add.s32 %r16, %r3, %r25;
mul.wide.u32 %rd9, %r16, 4; // byte offset = index * sizeof(int)
add.s64 %rd10, %rd1, %rd9;
ld.global.s32 %rd11, [%rd10]; // load
add.s64 %rd20, %rd11, %rd20;
add.s32 %r25, %r4, %r25; // offset += blockDim.x
setp.lt.u32 %p3, %r25, %r2;
@%p3 bra BB0_1;
// the following is inlined reduce part
BB0_2:
shl.b32 %r17, %r3, 3; // threadIdx.x * 8 -> this thread's int64 slot
mov.u32 %r18, _ZZ9bench_seqPiiPlE3buf;
add.s32 %r7, %r18, %r17;
st.shared.u64 [%r7], %rd20;
bar.sync 0; // __syncthreads()
shr.u32 %r26, %r4, 1; // i = blockDim.x / 2
setp.eq.s32 %p4, %r26, 0;
@%p4 bra BB0_6;
BB0_3:
setp.ge.u32 %p5, %r3, %r26;
@%p5 bra BB0_5; // only threads with tid < i accumulate
add.s32 %r19, %r26, %r3;
shl.b32 %r20, %r19, 3;
add.s32 %r22, %r18, %r20;
ld.shared.u64 %rd12, [%r7];
ld.shared.u64 %rd13, [%r22];
add.s64 %rd14, %rd12, %rd13;
st.shared.u64 [%r7], %rd14;
BB0_5:
bar.sync 0; // __syncthreads() at the bottom of each halving round
shr.u32 %r23, %r26, 31;
add.s32 %r24, %r26, %r23;
shr.s32 %r10, %r24, 1; // signed i /= 2 (rounds toward zero)
setp.gt.s32 %p6, %r26, 1;
mov.u32 %r26, %r10;
@%p6 bra BB0_3;
BB0_6:
setp.eq.s32 %p1, %r3, 0;
setp.ne.s32 %p7, %r3, 0;
@%p7 bra BB0_8; // only thread 0 performs the atomic add
ld.shared.u64 %rd15, [_ZZ9bench_seqPiiPlE3buf]; // buf[0] = block total
cvta.to.global.u64 %rd16, %rd5;
selp.b64 %rd17, %rd15, 0, %p1;
atom.global.add.u64 %rd18, [%rd16], %rd17;
BB0_8:
ret;
}
// .globl _Z16bench_seq_unrollPiiPl
// Generated PTX for bench_seq_unroll(int* a, int n, int64_t* res);
// mangled name _Z16bench_seq_unrollPiiPl. The unrolled body issues four
// address computations and four global loads per iteration.
.visible .entry _Z16bench_seq_unrollPiiPl(
.param .u64 _Z16bench_seq_unrollPiiPl_param_0,
.param .u32 _Z16bench_seq_unrollPiiPl_param_1,
.param .u64 _Z16bench_seq_unrollPiiPl_param_2
)
{
.reg .pred %p<8>;
.reg .b32 %r<35>;
.reg .b64 %rd<33>;
// demoted variable: the kernel's __shared__ int64_t buf[512] (512 * 8 bytes)
.shared .align 8 .b8 _ZZ16bench_seq_unrollPiiPlE3buf[4096];
ld.param.u64 %rd5, [_Z16bench_seq_unrollPiiPl_param_0]; // a
ld.param.u32 %r15, [_Z16bench_seq_unrollPiiPl_param_1]; // n
ld.param.u64 %rd6, [_Z16bench_seq_unrollPiiPl_param_2]; // res
mov.u32 %r16, %nctaid.x;
div.u32 %r17, %r15, %r16; // items_per_block = n / gridDim.x
mov.u32 %r18, %ctaid.x;
mul.lo.s32 %r33, %r18, %r17; // loop start = items_per_block * blockIdx.x
add.s32 %r19, %r18, 1;
mul.lo.s32 %r2, %r19, %r17; // loop end = items_per_block * (blockIdx.x+1)
mov.u64 %rd32, 0; // s = 0
mov.u32 %r3, %ntid.x; // blockDim.x
setp.ge.u32 %p2, %r33, %r2;
@%p2 bra BB1_3; // skip the loop when the block's range is empty
cvta.to.global.u64 %rd1, %rd5;
shl.b32 %r4, %r3, 2; // blockDim.x * 4 (loop stride)
shl.b32 %r5, %r3, 1; // blockDim.x * 2
mul.lo.s32 %r6, %r3, 3; // blockDim.x * 3
mov.u64 %rd32, 0;
mov.u32 %r7, %tid.x;
BB1_2:
add.s32 %r20, %r33, %r7;
mul.wide.u32 %rd9, %r20, 4; // byte offset = index * sizeof(int)
add.s64 %rd10, %rd1, %rd9;
add.s32 %r21, %r20, %r3;
mul.wide.u32 %rd11, %r21, 4;
add.s64 %rd12, %rd1, %rd11;
add.s32 %r22, %r20, %r5;
mul.wide.u32 %rd13, %r22, 4;
add.s64 %rd14, %rd1, %rd13;
add.s32 %r23, %r20, %r6;
mul.wide.u32 %rd15, %r23, 4;
add.s64 %rd16, %rd1, %rd15;
ld.global.s32 %rd17, [%rd10]; // load 1
add.s64 %rd18, %rd17, %rd32;
ld.global.s32 %rd19, [%rd12]; // load 2
add.s64 %rd20, %rd19, %rd18;
ld.global.s32 %rd21, [%rd14]; // load 3
add.s64 %rd22, %rd21, %rd20;
ld.global.s32 %rd23, [%rd16]; // load 4
add.s64 %rd32, %rd23, %rd22;
add.s32 %r33, %r4, %r33; // offset += blockDim.x * 4
setp.lt.u32 %p3, %r33, %r2;
@%p3 bra BB1_2;
// inlined reduce part follows
BB1_3:
mov.u32 %r10, %tid.x;
shl.b32 %r24, %r10, 3; // threadIdx.x * 8 -> this thread's int64 slot
mov.u32 %r25, _ZZ16bench_seq_unrollPiiPlE3buf;
add.s32 %r11, %r25, %r24;
st.shared.u64 [%r11], %rd32;
bar.sync 0; // __syncthreads()
shr.u32 %r34, %r3, 1; // i = blockDim.x / 2
setp.eq.s32 %p4, %r34, 0;
@%p4 bra BB1_7;
BB1_4:
setp.ge.u32 %p5, %r10, %r34;
@%p5 bra BB1_6; // only threads with tid < i accumulate
add.s32 %r27, %r34, %r10;
shl.b32 %r28, %r27, 3;
add.s32 %r30, %r25, %r28;
ld.shared.u64 %rd24, [%r11];
ld.shared.u64 %rd25, [%r30];
add.s64 %rd26, %rd24, %rd25;
st.shared.u64 [%r11], %rd26;
BB1_6:
bar.sync 0; // __syncthreads() at the bottom of each halving round
shr.u32 %r31, %r34, 31;
add.s32 %r32, %r34, %r31;
shr.s32 %r14, %r32, 1; // signed i /= 2 (rounds toward zero)
setp.gt.s32 %p6, %r34, 1;
mov.u32 %r34, %r14;
@%p6 bra BB1_4;
BB1_7:
setp.eq.s32 %p1, %r10, 0;
setp.ne.s32 %p7, %r10, 0;
@%p7 bra BB1_9; // only thread 0 performs the atomic add
ld.shared.u64 %rd27, [_ZZ16bench_seq_unrollPiiPlE3buf]; // buf[0] = block total
cvta.to.global.u64 %rd28, %rd6;
selp.b64 %rd29, %rd27, 0, %p1;
atom.global.add.u64 %rd30, [%rd28], %rd29;
BB1_9:
ret;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment