Last active
October 10, 2019 15:27
-
-
Save allanmac/f91b67c112bcba98649d to your computer and use it in GitHub Desktop.
Measure achieved bandwidth when performing 128, 256 or 512 byte transactions on a multi-megabyte extent. This appears to reproduce @Genoil's original findings: https://devtalk.nvidia.com/default/topic/878455/cuda-programming-and-performance/gtx750ti-and-buffers-gt-1gb-on-win7
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// -*- compile-command: "nvcc -m 64 -arch compute_30 -Xptxas=-v -o probe_bw probe_bw.cu"; -*- | |
// | |
// Copyright 2015 Allan MacKinnon <allanmac@alum.mit.edu> | |
// | |
// Permission is hereby granted, free of charge, to any person obtaining | |
// a copy of this software and associated documentation files (the | |
// "Software"), to deal in the Software without restriction, including | |
// without limitation the rights to use, copy, modify, merge, publish, | |
// distribute, sublicense, and/or sell copies of the Software, and to | |
// permit persons to whom the Software is furnished to do so, subject to | |
// the following conditions: | |
// | |
// The above copyright notice and this permission notice shall be | |
// included in all copies or substantial portions of the Software. | |
// | |
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | |
// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | |
// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | |
// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
// SOFTWARE. | |
// | |
#include <stdio.h> | |
#include <stdint.h> | |
// | |
//
// choose a reasonable block size
//
#define PXL_WARPS_PER_BLOCK 4
#define PXL_THREADS_PER_BLOCK (PXL_WARPS_PER_BLOCK * WARP_SIZE)
//
// launch the grid this many times and average
//
#define PXL_HOST_LOOPS 64
//
// select 512, 256 or 128 byte transactions
//
// Exactly one branch is active.  The transaction size is the per-thread
// element size times the 32 lanes of a warp: uint4 -> 512B, uint2 ->
// 256B, uint32_t -> 128B.  PXL_SEGMENT_TEST folds the loaded value into
// a compare against a sentinel so the compiler cannot eliminate the
// load; the input buffer is zero-filled, so the test never fires.
//
#if 0
#define PXL_SEGMENT_TYPE uint4 // low of ~9 GB/s
#define PXL_SEGMENT_TEST(v) ((v.x ^ v.y ^ v.z ^ v.w) == 0xDEADBEEF)
#elif 1
#define PXL_SEGMENT_TYPE uint2 // low of ~5 GB/s
#define PXL_SEGMENT_TEST(v) ((v.x ^ v.y) == 0xDEADBEEF)
#else
#define PXL_SEGMENT_TYPE uint32_t // low of ~3 GB/s
#define PXL_SEGMENT_TEST(v) (v == 0xDEADBEEF)
#endif
//
// macro expand to get multiple loads in flight
//
// PXL_REPEAT(ii) is (re)defined at each expansion site inside the
// kernel; PXL_UNROLL() stamps it out PXL_DEVICE_LOOPS times so the
// resulting independent loads can be issued back-to-back (ILP).
// Keep PXL_DEVICE_LOOPS in sync with the PXL_UNROLL_n() alias below.
//
#define PXL_DEVICE_LOOPS 7
#define PXL_UNROLL() PXL_UNROLL_7()
//
//
//
#define PXL_UNROLL_6() \
PXL_REPEAT(0); \
PXL_REPEAT(1); \
PXL_REPEAT(2); \
PXL_REPEAT(3); \
PXL_REPEAT(4); \
PXL_REPEAT(5)
#define PXL_UNROLL_7() \
PXL_UNROLL_6(); \
PXL_REPEAT(6)
#define PXL_UNROLL_8() \
PXL_UNROLL_7(); \
PXL_REPEAT(7)
//
// warp geometry and the resulting warp-wide segment size in bytes
//
#define WARP_SIZE 32
#define WARP_MASK (WARP_SIZE-1)
#define PXL_SEGMENT_SIZE (sizeof(PXL_SEGMENT_TYPE) * WARP_SIZE)
// | |
// | |
// | |
//
// Cheap multiplicative scrambler used only to decorrelate per-thread
// seeds before the modulo-segments reduction in the kernel.
// NOTE(review): despite the name this is NOT the Park-Miller "minimal
// standard" LCG (there is no mod 2^31-1 reduction) -- the product is
// simply truncated mod 2^32.  That is adequate for picking random
// segment indices here, but the name overstates the generator quality.
//
static
__device__ __host__
uint32_t
lcg_parkmiller(uint32_t seed)
{
  return seed * 48271u;
}
//
// optionally use LDG/LDCS
//
// All three paths load identical values; they differ only in which
// cache the transaction goes through:
//   - "#if 0" keeps the __ldg (read-only data cache) path disabled;
//     flip to 1 on sm_35+ to compare against it.
//   - sm_32+: __ldcs streaming load (.cs cache operator, evict-first)
//     since each segment is touched once per pass.
//   - otherwise: a plain global load.
//
#if 0 && (__CUDA_ARCH__ >= 350)
#define PXL_SEGMENT_LOAD(v,i) __ldg(v+i)
#elif (__CUDA_ARCH__ >= 320)
#define PXL_SEGMENT_LOAD(v,i) __ldcs(v+i) // .cs streaming load
#else
#define PXL_SEGMENT_LOAD(v,i) v[i]
#endif
// | |
// | |
// | |
//
// Each warp loads PXL_DEVICE_LOOPS randomly chosen warp-wide segments
// from 'vin'.  A per-thread random segment index is derived from the
// clock, then lane ii's index is broadcast to the whole warp for load
// ii (shuffle on sm_30+, shared memory on Fermi), so all 32 lanes read
// one contiguous segment -- i.e. each load is a single coalesced
// 128/256/512-byte transaction at a random offset in the extent.
// The 0xDEADBEEF test never fires on the zero-filled input buffer; it
// exists solely to keep the loads from being optimized away.
//
// Launch: 2D grid, PXL_THREADS_PER_BLOCK threads per block.
// NOTE(review): uses legacy mask-less __shfl -- sm_70+ requires
// __shfl_sync; matches the "Kepler/Maxwell only" note accompanying
// this file.
//
static
__global__
void
probe_kernel(const PXL_SEGMENT_TYPE* const vin, uint32_t* const vcounter, const uint32_t segments)
{
  // lane id within the warp
  const int32_t lid = threadIdx.x & WARP_MASK;
  // NOTE(review): blockIdx.y * blockIdx.x looks suspicious -- the
  // conventional 2D flattening is (blockIdx.y * gridDim.x + blockIdx.x)
  // * blockDim.x.  As written, any block with a zero x or y index
  // collapses the product to 0, so many blocks share gid_x values.
  // Harmless here because gid_x only perturbs a random seed, but worth
  // confirming intent.
  const uint32_t gid_x = blockIdx.y * blockIdx.x * blockDim.x + threadIdx.x;
  const uint32_t seed = lcg_parkmiller(clock() * ~(gid_x + 1)); // must not be zero
#if __CUDA_ARCH__ < 300
  // Fermi has no shuffle: stage each thread's segment index in shared
  // memory and read lane ii's slot from this warp's 32-entry window.
  __shared__ uint32_t vidx[PXL_THREADS_PER_BLOCK];
  vidx[threadIdx.x] = seed % segments;
#define PXL_VIDX_LOAD(ii) vidx[(threadIdx.x & ~WARP_MASK) + ii]
#else
  // Kepler+: broadcast lane ii's segment index via warp shuffle.
  const uint32_t vidx = seed % segments;
#define PXL_VIDX_LOAD(ii) __shfl(vidx,ii)
#endif
  // First expansion: issue all PXL_DEVICE_LOOPS independent loads so
  // they can be in flight simultaneously ...
#undef PXL_REPEAT
#define PXL_REPEAT(ii) \
const PXL_SEGMENT_TYPE v##ii = PXL_SEGMENT_LOAD(vin,PXL_VIDX_LOAD(ii) * WARP_SIZE + lid)
  PXL_UNROLL();
  // ... second expansion: consume each loaded value so none of the
  // loads is dead code (the atomic is never reached on zeroed input).
#undef PXL_REPEAT
#define PXL_REPEAT(ii) \
if (PXL_SEGMENT_TEST(v##ii)) \
atomicInc(vcounter,UINT32_MAX)
  PXL_UNROLL();
}
// | |
// | |
// | |
#include <stdbool.h> | |
//
// Report a failed CUDA runtime call with its source location and,
// when 'abort' is set, terminate the process using the error code as
// the exit status.  Success is a silent no-op.
//
static
void
cuda_assert(const cudaError_t code, const char* const file, const int line, const bool abort)
{
  if (code == cudaSuccess)
    return;

  fprintf(stderr,"cuda_assert: %s %s %d\n",cudaGetErrorString(code),file,line);

  if (abort)
    exit(code);
}

//
// Prepend "cuda" to the wrapped call and route its status through
// cuda_assert() with the call site's file/line.
//
#define cuda(...) cuda_assert((cuda##__VA_ARGS__), __FILE__, __LINE__, true);
// | |
// | |
// | |
//
// Allocate a zero-filled extent of 'extent_mb' megabytes, then launch
// probe_kernel PXL_HOST_LOOPS times so each pass touches 'probe_mb'
// megabytes of randomly selected segments.  Prints one CSV row:
// alloc MB, probe MB, msecs, GB/s (averaged over the host loops).
//
static
void
probe_kernel_launcher(const size_t extent_mb, const size_t probe_mb)
{
  // number of warp-wide segments in the extent
  const size_t segments = extent_mb * 1024 * 1024 / PXL_SEGMENT_SIZE;
  const size_t elems    = segments * WARP_SIZE;
  const size_t bytes    = elems * sizeof(PXL_SEGMENT_TYPE);

  // alloc -- zero-filled so PXL_SEGMENT_TEST() never matches the
  // 0xDEADBEEF sentinel and the atomic counter path stays cold
  PXL_SEGMENT_TYPE* vin_d;
  cuda(Malloc(&vin_d, bytes));
  cuda(Memset(vin_d,0,bytes));

  // size the grid -- one warp per probed segment
  const size_t probes = probe_mb * 1024 * 1024 / PXL_SEGMENT_SIZE;
  const size_t blocks = (probes + PXL_WARPS_PER_BLOCK - 1) / PXL_WARPS_PER_BLOCK;

  // Fermi support -- 1 MB in dim.x and probe_mb in dim.y
  const dim3 block_dim(1024*1024/PXL_SEGMENT_SIZE/PXL_WARPS_PER_BLOCK,probe_mb);

  // warm-up (result discarded); the counter arg simply aliases vin_d.
  // Kernel launches return no status directly -- without GetLastError()
  // a bad launch config fails silently and the timing loop below would
  // then measure empty launches.
  probe_kernel<<<block_dim,PXL_THREADS_PER_BLOCK>>>(vin_d,(uint32_t*)vin_d,segments);
  cuda(GetLastError());

  // measure with events bracketing the launch loop
  cudaEvent_t start, end;
  cuda(EventCreate(&start));
  cuda(EventCreate(&end));
  cuda(EventRecord(start));

  for (int ii=0; ii<PXL_HOST_LOOPS; ii++)
    probe_kernel<<<block_dim,PXL_THREADS_PER_BLOCK>>>(vin_d,(uint32_t*)vin_d,segments);

  cuda(GetLastError()); // catch launch errors from the timed loop

  cuda(EventRecord(end));
  cuda(EventSynchronize(end));

  float elapsed; // msecs
  cuda(EventElapsedTime(&elapsed,start,end));

  // total warp-wide loads issued per grid launch
  const size_t loads = PXL_DEVICE_LOOPS * PXL_WARPS_PER_BLOCK * blocks;

  printf(
#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
         "%8llu, %8llu, %8.2f, %8.2f\n",
#else
         "%8u, %8u, %8.2f, %8.2f\n",
#endif
         extent_mb,
         loads * PXL_SEGMENT_SIZE * PXL_HOST_LOOPS / (1024 * 1024),
         elapsed,
         (1000.0 * PXL_HOST_LOOPS * PXL_SEGMENT_SIZE * loads) / (1024.0 * 1024.0 * 1024.0 * elapsed));

  //
  cuda(Free(vin_d));
  cuda(EventDestroy(start));
  cuda(EventDestroy(end));
}
// | |
// | |
// | |
//
// Usage: probe_bw <device id> <lo mbytes> <hi mbytes> <step mbytes> <probe mbytes>
//
// Sweeps the allocation from lo MB to hi MB in step-MB increments,
// probing probe MB of random segments per pass on the chosen device.
//
int
main(int argc, char** argv)
{
  //
  // Five positional arguments are required (argv[1]..argv[5]).
  // BUGFIX: the original guard was (argc < 4), which let argv[4] and
  // argv[5] be read out of bounds when fewer arguments were supplied.
  //
  if (argc < 6)
    {
      printf("%s: <device id> <lo mbytes> <hi mbytes> <step mbytes> <probe mbytes>\n",argv[0]);
      exit(0);
    }
  //
  const int32_t device   = atoi(argv[1]);
  size_t        lo_mb    = max(atoi(argv[2]),1); // clamp all sizes to >= 1 MB
  const size_t  hi_mb    = max(atoi(argv[3]),1);
  const size_t  step_mb  = max(atoi(argv[4]),1);
  const size_t  probe_mb = max(atoi(argv[5]),1);

  struct cudaDeviceProp props;
  cuda(GetDeviceProperties(&props,device));

  // totalGlobalMem is a size_t -- cast explicitly so the "%d"
  // conversion is well-defined on 64-bit platforms
  printf("%s : %d SM : %d MB\n",
         props.name,
         props.multiProcessorCount,
         (int)(props.totalGlobalMem / (1024 * 1024)));

  cuda(SetDevice(device));

  // cast the size_t bounds to exactly the types the format expects
  printf("Probing from: "
#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
         "%llu - %llu MB ...\n",
         (unsigned long long)lo_mb,(unsigned long long)hi_mb);
#else
         "%u - %u MB ...\n",
         (unsigned)lo_mb,(unsigned)hi_mb);
#endif

  printf("alloc MB, probe MB, msecs, GB/s\n");

  for (; lo_mb <= hi_mb; lo_mb += step_mb)
    probe_kernel_launcher(lo_mb,probe_mb);
  //
  cuda(DeviceReset());

  return 0;
}
Usage:
probe_bw <device id> <lo mbytes> <hi mbytes> <step mbytes> <probe mbytes>
Example:
probe_bw 0 100 3500 100 32
GeForce GTX 980 : 16 SM : 4096 MB
Probing from: 100 - 3500 MB...
alloc MB, probe MB, msecs, GB/s
100, 14336, 81.71, 171.34
200, 14336, 83.00, 168.68
300, 14336, 83.49, 167.68
400, 14336, 83.72, 167.22
500, 14336, 83.86, 166.94
600, 14336, 83.94, 166.78
700, 14336, 84.03, 166.61
800, 14336, 84.08, 166.51
900, 14336, 84.17, 166.34
1000, 14336, 84.19, 166.28
1100, 14336, 84.28, 166.12
1200, 14336, 84.41, 165.85
1300, 14336, 84.81, 165.07
1400, 14336, 85.39, 163.95
1500, 14336, 85.94, 162.90
1600, 14336, 86.44, 161.96
1700, 14336, 86.89, 161.12
1800, 14336, 87.28, 160.41
1900, 14336, 87.63, 159.77
2000, 14336, 87.97, 159.14
2100, 14336, 151.29, 92.54
2200, 14336, 411.03, 34.06
2300, 14336, 655.55, 21.36
2400, 14336, 877.67, 15.95
2500, 14336, 1086.30, 12.89
2600, 14336, 1276.24, 10.97
2700, 14336, 1455.30, 9.62
2800, 14336, 1616.22, 8.66
2900, 14336, 1780.45, 7.86
3000, 14336, 1918.16, 7.30
3100, 14336, 2059.72, 6.80
3200, 14336, 2194.33, 6.38
3300, 14336, 2306.41, 6.07
3400, 14336, 2423.19, 5.78
3500, 14336, 2529.70, 5.53
Great job. We know the 970 has the 3.5 GB issue, but why does the 980 show a random-access bandwidth drop beyond 2 GB? I once found (Nov 2015) that the AMD Fury had an even more serious random-access issue (a penalty down to only ~40% of the theoretical HBM bandwidth — even worse than an R9 390). Would you be able to explain why this happens from a hardware perspective?
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Kepler/Maxwell only.
Build with:
nvcc -m 64 -arch compute_30 -o probe_bw probe_bw.cu
If you want to activate the "streaming load" cache modifier then compile with:
nvcc -m 64 -arch compute_32 -o probe_bw probe_bw.cu