Skip to content

Instantly share code, notes, and snippets.

@allanmac
Last active October 10, 2019 15:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save allanmac/f91b67c112bcba98649d to your computer and use it in GitHub Desktop.
Save allanmac/f91b67c112bcba98649d to your computer and use it in GitHub Desktop.
Measure achieved bandwidth when performing 128, 256 or 512 byte transactions on a multi-megabyte extent. This appears to reproduce @Genoil's original findings: https://devtalk.nvidia.com/default/topic/878455/cuda-programming-and-performance/gtx750ti-and-buffers-gt-1gb-on-win7
// -*- compile-command: "nvcc -m 64 -arch compute_30 -Xptxas=-v -o probe_bw probe_bw.cu"; -*-
//
// Copyright 2015 Allan MacKinnon <allanmac@alum.mit.edu>
//
// Permission is hereby granted, free of charge, to any person obtaining
// a copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
//
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h> // exit(), atoi() -- previously pulled in only transitively by nvcc
//
//
// choose a reasonable block size (4 warps = 128 threads per block)
//
#define PXL_WARPS_PER_BLOCK 4
#define PXL_THREADS_PER_BLOCK (PXL_WARPS_PER_BLOCK * WARP_SIZE)
//
// launch the grid this many times and average
//
#define PXL_HOST_LOOPS 64
//
// select 512, 256 or 128 byte transactions: one warp-wide load of a
// PXL_SEGMENT_TYPE element per lane is 32 * sizeof(type) bytes.
// PXL_SEGMENT_TEST consumes the loaded value so the compiler cannot
// dead-code-eliminate the load; the buffer is zero-filled by the host,
// so the 0xDEADBEEF comparison never actually fires.
//
#if 0
#define PXL_SEGMENT_TYPE uint4 // low of ~9 GB/s
#define PXL_SEGMENT_TEST(v) ((v.x ^ v.y ^ v.z ^ v.w) == 0xDEADBEEF)
#elif 1
#define PXL_SEGMENT_TYPE uint2 // low of ~5 GB/s
#define PXL_SEGMENT_TEST(v) ((v.x ^ v.y) == 0xDEADBEEF)
#else
#define PXL_SEGMENT_TYPE uint32_t // low of ~3 GB/s
#define PXL_SEGMENT_TEST(v) (v == 0xDEADBEEF)
#endif
//
// macro expand to get multiple loads in flight (instruction-level
// parallelism): PXL_UNROLL() stamps out PXL_REPEAT(0..N-1), and the
// caller redefines PXL_REPEAT before each expansion.
// NOTE(review): PXL_DEVICE_LOOPS must match the PXL_UNROLL_N selected
// below -- keep the two in sync when changing either.
//
#define PXL_DEVICE_LOOPS 7
#define PXL_UNROLL() PXL_UNROLL_7()
//
// repetition ladders -- each level adds one more PXL_REPEAT
//
#define PXL_UNROLL_6() \
PXL_REPEAT(0); \
PXL_REPEAT(1); \
PXL_REPEAT(2); \
PXL_REPEAT(3); \
PXL_REPEAT(4); \
PXL_REPEAT(5)
#define PXL_UNROLL_7() \
PXL_UNROLL_6(); \
PXL_REPEAT(6)
#define PXL_UNROLL_8() \
PXL_UNROLL_7(); \
PXL_REPEAT(7)
//
// warp geometry and the resulting per-warp segment size in bytes
//
#define WARP_SIZE 32
#define WARP_MASK (WARP_SIZE-1)
#define PXL_SEGMENT_SIZE (sizeof(PXL_SEGMENT_TYPE) * WARP_SIZE)
//
//
//
// Cheap multiplicative mixing step used to scatter the probe indices.
// NOTE(review): despite the name, this is only the MINSTD multiplier
// (48271) with uint32 wraparound -- there is no `mod 2^31-1` step, so
// it is not a full Park-Miller/Lehmer generator. That is adequate here
// because the caller only needs a well-mixed per-thread seed.
static
__device__ __host__
uint32_t
lcg_parkmiller(uint32_t seed)
{
  const uint32_t multiplier = 48271u; // MINSTD constant
  return multiplier * seed;
}
//
// optionally use LDG/LDCS: select the global-load flavor by
// architecture. The `0 &&` deliberately disables the read-only-cache
// __ldg path; flip it to `1 &&` to enable on sm_35+. In host
// compilation __CUDA_ARCH__ is undefined and evaluates as 0, so the
// plain-load fallback is chosen there.
//
#if 0 && (__CUDA_ARCH__ >= 350)
#define PXL_SEGMENT_LOAD(v,i) __ldg(v+i)
#elif (__CUDA_ARCH__ >= 320)
#define PXL_SEGMENT_LOAD(v,i) __ldcs(v+i) // .cs streaming load (sm_32+)
#else
#define PXL_SEGMENT_LOAD(v,i) v[i] // plain global load
#endif
//
//
//
//
// Each warp loads PXL_DEVICE_LOOPS pseudo-randomly chosen warp-wide
// segments from vin and bumps *vcounter on an (impossible) match so
// the loads cannot be optimized away.
//
// Launch: 2-D grid, 1-D blocks of PXL_THREADS_PER_BLOCK threads.
// vin must hold `segments` segments of WARP_SIZE elements each.
//
static
__global__
void
probe_kernel(const PXL_SEGMENT_TYPE* const vin, uint32_t* const vcounter, const uint32_t segments)
{
  const int32_t lid = threadIdx.x & WARP_MASK;

  // FIX: the original computed
  //   blockIdx.y * blockIdx.x * blockDim.x + threadIdx.x
  // which MULTIPLIES the two block indices -- every block with
  // blockIdx.x == 0 or blockIdx.y == 0 collapses to the same value and
  // distinct blocks alias onto identical seeds. Flatten the 2-D grid
  // properly instead:
  const uint32_t gid_x = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;

  // Mix the SM clock with the global thread id; ~(gid_x + 1) keeps the
  // multiplier nonzero for thread 0.
  const uint32_t seed = lcg_parkmiller(clock() * ~(gid_x + 1)); // must not be zero

#if __CUDA_ARCH__ < 300
  // Pre-Kepler has no warp shuffle: publish each lane's segment index
  // through shared memory. PXL_VIDX_LOAD only reads slots belonging to
  // the reader's own warp, so no block-wide barrier is used here --
  // NOTE(review): this relies on pre-Volta intra-warp lockstep.
  __shared__ uint32_t vidx[PXL_THREADS_PER_BLOCK];
  vidx[threadIdx.x] = seed % segments;
#define PXL_VIDX_LOAD(ii) vidx[(threadIdx.x & ~WARP_MASK) + ii]
#else
  // Kepler+: broadcast lane ii's segment index with a warp shuffle.
  // (Legacy mask-less __shfl is fine for the compute_30..compute_5x
  // targets this was written for; Volta+ would need __shfl_sync.)
  const uint32_t vidx = seed % segments;
#define PXL_VIDX_LOAD(ii) __shfl(vidx,ii)
#endif

  // First expansion: issue PXL_DEVICE_LOOPS independent warp-wide
  // segment loads so several transactions are in flight at once.
#undef PXL_REPEAT
#define PXL_REPEAT(ii) \
const PXL_SEGMENT_TYPE v##ii = PXL_SEGMENT_LOAD(vin,PXL_VIDX_LOAD(ii) * WARP_SIZE + lid)
  PXL_UNROLL();

  // Second expansion: consume every loaded value. The buffer is
  // zero-filled by the host, so the test never fires in practice --
  // the atomic exists only to keep the loads live.
#undef PXL_REPEAT
#define PXL_REPEAT(ii) \
if (PXL_SEGMENT_TEST(v##ii)) \
atomicInc(vcounter,UINT32_MAX)
  PXL_UNROLL();
}
//
//
//
#include <stdbool.h>
//
// Report a CUDA runtime error with its file/line; optionally abort the
// process using the error code as the exit status.
//
static
void
cuda_assert(const cudaError_t code, const char* const file, const int line, const bool abort)
{
  if (code != cudaSuccess)
    {
      fprintf(stderr,"cuda_assert: %s %s %d\n",cudaGetErrorString(code),file,line);
      if (abort)
        exit(code);
    }
}
//
// cuda(Malloc(...)) expands to a checked cudaMalloc(...) call.
// FIX: the original macro ended in a semicolon, so call sites like
// `cuda(Free(p));` expanded to `...;;` and, worse, the macro could not
// be used safely in an unbraced `if (...) cuda(...); else` statement.
// Every call site supplies its own `;`, so dropping it is backward
// compatible.
//
#define cuda(...) cuda_assert((cuda##__VA_ARGS__), __FILE__, __LINE__, true)
//
//
//
//
// Allocate a zero-filled extent of extent_mb MB, launch probe_kernel
// PXL_HOST_LOOPS times (each launch touching probe_mb MB of randomly
// chosen segments), and print the averaged achieved bandwidth as one
// CSV row: alloc MB, probe MB, msecs, GB/s.
//
static
void
probe_kernel_launcher(const size_t extent_mb, const size_t probe_mb)
{
  const size_t segments = extent_mb * 1024 * 1024 / PXL_SEGMENT_SIZE;
  const size_t elems    = segments * WARP_SIZE;
  const size_t bytes    = elems * sizeof(PXL_SEGMENT_TYPE);

  // alloc + zero-fill -- zeroing guarantees PXL_SEGMENT_TEST never
  // matches, so the counter atomics stay cold
  PXL_SEGMENT_TYPE* vin_d;
  cuda(Malloc(&vin_d, bytes));
  cuda(Memset(vin_d,0,bytes));

  // size the grid: one warp performs PXL_DEVICE_LOOPS segment probes
  const size_t probes = probe_mb * 1024 * 1024 / PXL_SEGMENT_SIZE;
  const size_t blocks = (probes + PXL_WARPS_PER_BLOCK - 1) / PXL_WARPS_PER_BLOCK;

  // Fermi support -- dim.x is capped at 65535 there, so split the grid:
  // 1 MB worth of blocks in dim.x and probe_mb in dim.y.
  // (dim.x * dim.y equals `blocks`, which the `loads` math below uses.)
  const dim3 block_dim(1024*1024/PXL_SEGMENT_SIZE/PXL_WARPS_PER_BLOCK,probe_mb);

  // warm-up launch; the counter pointer is aliased onto the zeroed
  // input buffer since it is never expected to be written
  probe_kernel<<<block_dim,PXL_THREADS_PER_BLOCK>>>(vin_d,(uint32_t*)vin_d,segments);
  cuda(GetLastError()); // FIX: surface launch-configuration errors

  // measure: events on the default stream bracket the timed launches;
  // stream ordering means the warm-up completes before `start` records
  cudaEvent_t start, end;
  cuda(EventCreate(&start));
  cuda(EventCreate(&end));
  cuda(EventRecord(start));
  for (int ii=0; ii<PXL_HOST_LOOPS; ii++)
    probe_kernel<<<block_dim,PXL_THREADS_PER_BLOCK>>>(vin_d,(uint32_t*)vin_d,segments);
  cuda(GetLastError()); // FIX: previously a failed launch timed as ~0 ms
  cuda(EventRecord(end));
  cuda(EventSynchronize(end));

  float elapsed;
  cuda(EventElapsedTime(&elapsed,start,end)); // milliseconds

  // total segment loads issued per grid launch
  const size_t loads = PXL_DEVICE_LOOPS * PXL_WARPS_PER_BLOCK * blocks;

  // size_t format specifier differs by platform; this predates reliable
  // %zu support on MSVC, hence the 64/32-bit switch
  printf(
#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
         "%8llu, %8llu, %8.2f, %8.2f\n",
#else
         "%8u, %8u, %8.2f, %8.2f\n",
#endif
         extent_mb,
         loads * PXL_SEGMENT_SIZE * PXL_HOST_LOOPS / (1024 * 1024),
         elapsed,
         (1000.0 * PXL_HOST_LOOPS * PXL_SEGMENT_SIZE * loads) / (1024.0 * 1024.0 * 1024.0 * elapsed));

  // cleanup
  cuda(Free(vin_d));
  cuda(EventDestroy(start));
  cuda(EventDestroy(end));
}
//
//
//
//
// Usage: probe_bw <device id> <lo mbytes> <hi mbytes> <step mbytes> <probe mbytes>
// Sweeps the allocation size from lo to hi in steps, printing one
// bandwidth row per size.
//
int
main(int argc, char** argv)
{
  // FIX: the program reads argv[1]..argv[5], i.e. 5 arguments, so the
  // guard must be argc < 6 -- the original tested argc < 4 and would
  // read past the end of argv when given 3 or 4 arguments.
  if (argc < 6)
    {
      printf("%s: <device id> <lo mbytes> <hi mbytes> <step mbytes> <probe mbytes>\n",argv[0]);
      exit(0);
    }

  const int32_t device  = atoi(argv[1]);
  size_t lo_mb          = max(atoi(argv[2]),1); // clamp to >= 1 MB
  const size_t hi_mb    = max(atoi(argv[3]),1);
  const size_t step_mb  = max(atoi(argv[4]),1);
  const size_t probe_mb = max(atoi(argv[5]),1);

  struct cudaDeviceProp props;
  cuda(GetDeviceProperties(&props,device));

  // FIX: totalGlobalMem is a size_t; printing it with %d is undefined
  // behavior on 64-bit platforms -- convert the MB value to uint32_t
  // (device memory in MB comfortably fits) and print with %u.
  printf("%s : %d SM : %u MB\n",
         props.name,
         props.multiProcessorCount,
         (uint32_t)(props.totalGlobalMem / (1024 * 1024)));

  cuda(SetDevice(device));

  // size_t format specifier switch -- matches the scheme used elsewhere
  printf("Probing from: "
#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
         "%llu - %llu MB ...\n",
#else
         "%u - %u MB ...\n",
#endif
         lo_mb,hi_mb);

  printf("alloc MB, probe MB, msecs, GB/s\n");
  for (; lo_mb <= hi_mb; lo_mb += step_mb)
    probe_kernel_launcher(lo_mb,probe_mb);

  cuda(DeviceReset());
  return 0;
}
@allanmac
Copy link
Author

allanmac commented Sep 30, 2015

Kepler/Maxwell only.

Build with: nvcc -m 64 -arch compute_30 -o probe_bw probe_bw.cu

If you want to activate the "streaming load" cache modifier then compile with:

nvcc -m 64 -arch compute_32 -o probe_bw probe_bw.cu

@allanmac
Copy link
Author

Usage:

probe_bw <device id> <lo mbytes> <hi mbytes> <step mbytes> <probe mbytes>

Example:

probe_bw 0 100 3500 100 32
GeForce GTX 980 : 16 SM : 4096 MB
Probing from: 100 - 3500 MB...
alloc MB, probe MB,    msecs,     GB/s
     100,    14336,    81.71,   171.34
     200,    14336,    83.00,   168.68
     300,    14336,    83.49,   167.68
     400,    14336,    83.72,   167.22
     500,    14336,    83.86,   166.94
     600,    14336,    83.94,   166.78
     700,    14336,    84.03,   166.61
     800,    14336,    84.08,   166.51
     900,    14336,    84.17,   166.34
    1000,    14336,    84.19,   166.28
    1100,    14336,    84.28,   166.12
    1200,    14336,    84.41,   165.85
    1300,    14336,    84.81,   165.07
    1400,    14336,    85.39,   163.95
    1500,    14336,    85.94,   162.90
    1600,    14336,    86.44,   161.96
    1700,    14336,    86.89,   161.12
    1800,    14336,    87.28,   160.41
    1900,    14336,    87.63,   159.77
    2000,    14336,    87.97,   159.14
    2100,    14336,   151.29,    92.54
    2200,    14336,   411.03,    34.06
    2300,    14336,   655.55,    21.36
    2400,    14336,   877.67,    15.95
    2500,    14336,  1086.30,    12.89
    2600,    14336,  1276.24,    10.97
    2700,    14336,  1455.30,     9.62
    2800,    14336,  1616.22,     8.66
    2900,    14336,  1780.45,     7.86
    3000,    14336,  1918.16,     7.30
    3100,    14336,  2059.72,     6.80
    3200,    14336,  2194.33,     6.38
    3300,    14336,  2306.41,     6.07
    3400,    14336,  2423.19,     5.78
    3500,    14336,  2529.70,     5.53

ebtyxd2

@smartbitcoin
Copy link

Great job. We know the GTX 970 has the 3.5 GB issue, but why would the GTX 980 show a random-access bandwidth drop beyond 2 GB? I also found (Nov 2015) that the AMD Fury has an even more serious random-access issue — a penalty down to only about 40% of the theoretical HBM bandwidth, worse than an R9 390. Could you explain why this happens from a hardware perspective?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment