-
-
Save smartbitcoin/fab61877fc5b38483429 to your computer and use it in GitHub Desktop.
Measure achieved bandwidth when performing 128, 256 or 512 byte transactions on a multi-megabyte extent. This appears to reproduce @Genoil's original findings: https://devtalk.nvidia.com/default/topic/878455/cuda-programming-and-performance/gtx750ti-and-buffers-gt-1gb-on-win7
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// -*- compile-command: "nvcc -m 64 -arch compute_30 -Xptxas=-v -o probe_bw probe_bw.cu"; -*- | |
// | |
#include <stdio.h> | |
#include <stdint.h> | |
// | |
// choose a reasonable block size | |
// | |
#define PXL_WARPS_PER_BLOCK 4 | |
#define PXL_THREADS_PER_BLOCK (PXL_WARPS_PER_BLOCK * WARP_SIZE) | |
// | |
// launch the grid this many times and average | |
// | |
#define PXL_HOST_LOOPS 64 | |
// | |
// select 512, 256 or 128 byte transactions | |
// | |
#if 0 | |
#define PXL_SEGMENT_TYPE uint4 // low of ~9 GB/s | |
#define PXL_SEGMENT_TEST(v) ((v.x ^ v.y ^ v.z ^ v.w) == 0xDEADBEEF) | |
#elif 0 | |
#define PXL_SEGMENT_TYPE uint2 // low of ~5 GB/s | |
#define PXL_SEGMENT_TEST(v) ((v.x ^ v.y) == 0xDEADBEEF) | |
#else | |
#define PXL_SEGMENT_TYPE uint32_t // low of ~3 GB/s | |
#define PXL_SEGMENT_TEST(v) (v == 0xDEADBEEF) | |
#endif | |
// | |
// macro expand to get multiple loads in flight | |
// | |
#define PXL_DEVICE_LOOPS 7 | |
#define PXL_UNROLL() PXL_UNROLL_7() | |
// | |
// | |
// | |
#define PXL_UNROLL_6() \ | |
PXL_REPEAT(0); \ | |
PXL_REPEAT(1); \ | |
PXL_REPEAT(2); \ | |
PXL_REPEAT(3); \ | |
PXL_REPEAT(4); \ | |
PXL_REPEAT(5) | |
#define PXL_UNROLL_7() \ | |
PXL_UNROLL_6(); \ | |
PXL_REPEAT(6) | |
#define PXL_UNROLL_8() \ | |
PXL_UNROLL_7(); \ | |
PXL_REPEAT(7) | |
// | |
// | |
// | |
#define WARP_SIZE 32 | |
#define WARP_MASK (WARP_SIZE-1) | |
#define PXL_SEGMENT_SIZE (sizeof(PXL_SEGMENT_TYPE) * WARP_SIZE) | |
// | |
// | |
// | |
static | |
__device__ __host__ | |
uint32_t | |
lcg_parkmiller(uint32_t seed) | |
{ | |
return seed * 48271u; | |
} | |
// | |
// optionally use LDG | |
// | |
#if 0 && (__CUDA_ARCH__ >= 350) | |
#define PXL_SEGMENT_LOAD(v,i) __ldg(v+i) | |
#else | |
#define PXL_SEGMENT_LOAD(v,i) v[i] | |
#endif | |
// | |
// | |
// | |
static | |
__global__ | |
void | |
probe_kernel(const PXL_SEGMENT_TYPE* const vin, uint32_t* const vcounter, const uint32_t segments) | |
{ | |
const int32_t lid = threadIdx.x & WARP_MASK; | |
const uint32_t gid_x = blockIdx.y * blockIdx.x * blockDim.x + threadIdx.x; | |
const uint32_t seed = lcg_parkmiller(clock() * ~(gid_x + 1)); // must not be zero | |
#if __CUDA_ARCH__ < 300 | |
__shared__ uint32_t vidx[PXL_THREADS_PER_BLOCK]; | |
vidx[threadIdx.x] = seed % segments; | |
#define PXL_VIDX_LOAD(ii) vidx[(threadIdx.x & ~WARP_MASK) + ii] | |
#else | |
const uint32_t vidx = seed % segments; | |
#define PXL_VIDX_LOAD(ii) __shfl(vidx,ii) | |
#endif | |
#undef PXL_REPEAT | |
#define PXL_REPEAT(ii) \ | |
const PXL_SEGMENT_TYPE v##ii = PXL_SEGMENT_LOAD(vin,PXL_VIDX_LOAD(ii) * WARP_SIZE + lid) | |
PXL_UNROLL(); | |
#undef PXL_REPEAT | |
#define PXL_REPEAT(ii) \ | |
if (PXL_SEGMENT_TEST(v##ii)) \ | |
atomicInc(vcounter,UINT32_MAX) | |
PXL_UNROLL(); | |
} | |
// | |
// | |
// | |
#include <stdbool.h> | |
static | |
void | |
cuda_assert(const cudaError_t code, const char* const file, const int line, const bool abort) | |
{ | |
if (code != cudaSuccess) | |
{ | |
fprintf(stderr,"cuda_assert: %s %s %d\n",cudaGetErrorString(code),file,line); | |
if (abort) | |
exit(code); | |
} | |
} | |
#define cuda(...) cuda_assert((cuda##__VA_ARGS__), __FILE__, __LINE__, true); | |
// | |
// | |
// | |
static | |
void | |
probe_kernel_launcher(const size_t extent_mb, const size_t probe_mb) | |
{ | |
const size_t segments = extent_mb * 1024 * 1024 / PXL_SEGMENT_SIZE; | |
const size_t elems = segments * WARP_SIZE; | |
const size_t bytes = elems * sizeof(PXL_SEGMENT_TYPE); | |
// alloc | |
PXL_SEGMENT_TYPE* vin_d; | |
cuda(Malloc(&vin_d, bytes)); | |
cuda(Memset(vin_d,0,bytes)); | |
// size the grid | |
const size_t probes = probe_mb * 1024 * 1024 / PXL_SEGMENT_SIZE; | |
const size_t blocks = (probes + PXL_WARPS_PER_BLOCK - 1) / PXL_WARPS_PER_BLOCK; | |
// Fermi support -- 1 MB in dim.x and probe_mb in dim.y | |
const dim3 block_dim(1024*1024/PXL_SEGMENT_SIZE/PXL_WARPS_PER_BLOCK,probe_mb); | |
// warm-up | |
probe_kernel<<<block_dim,PXL_THREADS_PER_BLOCK>>>(vin_d,(uint32_t*)vin_d,segments); | |
// measure | |
cudaEvent_t start, end; | |
cuda(EventCreate(&start)); | |
cuda(EventCreate(&end)); | |
cuda(EventRecord(start)); | |
for (int ii=0; ii<PXL_HOST_LOOPS; ii++) | |
probe_kernel<<<block_dim,PXL_THREADS_PER_BLOCK>>>(vin_d,(uint32_t*)vin_d,segments); | |
cuda(EventRecord(end)); | |
cuda(EventSynchronize(end)); | |
float elapsed; | |
cuda(EventElapsedTime(&elapsed,start,end)); | |
const size_t loads = PXL_DEVICE_LOOPS * PXL_WARPS_PER_BLOCK * blocks; | |
printf( | |
#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64) | |
"%8llu, %8llu, %8.2f, %8.2f\n", | |
#else | |
"%8u, %8u, %8.2f, %8.2f\n", | |
#endif | |
extent_mb, | |
loads * PXL_SEGMENT_SIZE * PXL_HOST_LOOPS / (1024 * 1024), | |
elapsed, | |
(1000.0 * PXL_HOST_LOOPS * PXL_SEGMENT_SIZE * loads) / (1024.0 * 1024.0 * 1024.0 * elapsed)); | |
// | |
cuda(Free(vin_d)); | |
cuda(EventDestroy(start)); | |
cuda(EventDestroy(end)); | |
} | |
// | |
// | |
// | |
int | |
main(int argc, char** argv) | |
{ | |
// | |
if (argc < 4) | |
{ | |
printf("%s: <device id> <lo mbytes> <hi mbytes> <step mbytes> <probe mbytes>\n",argv[0]); | |
exit(0); | |
} | |
// | |
const int32_t device = atoi(argv[1]); | |
size_t lo_mb = max(atoi(argv[2]),1); | |
const size_t hi_mb = max(atoi(argv[3]),1); | |
const size_t step_mb = max(atoi(argv[4]),1); | |
const size_t probe_mb = max(atoi(argv[5]),1); | |
struct cudaDeviceProp props; | |
cuda(GetDeviceProperties(&props,device)); | |
printf("%s : %d SM : %d MB\n", | |
props.name, | |
props.multiProcessorCount, | |
props.totalGlobalMem / (1024 * 1024)); | |
cuda(SetDevice(device)); | |
printf("Probing from: " | |
#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64) | |
"%llu - %llu MB ...\n", | |
#else | |
"%u - %u MB ...\n", | |
#endif | |
lo_mb,hi_mb); | |
printf("alloc MB, probe MB, msecs, GB/s\n"); | |
for (; lo_mb <= hi_mb; lo_mb += step_mb) | |
probe_kernel_launcher(lo_mb,probe_mb); | |
// | |
cuda(DeviceReset()); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment