Skip to content

Instantly share code, notes, and snippets.

@Bulat-Ziganshin
Created May 17, 2016 19:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Bulat-Ziganshin/41161246987d58d7f710b6b8fd284545 to your computer and use it in GitHub Desktop.
Save Bulat-Ziganshin/41161246987d58d7f710b6b8fd284545 to your computer and use it in GitHub Desktop.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
// Size in bytes of the device buffer allocated in main() and addressed by the kernel.
#define DATA_SIZE (1 << 29)
// Number of outer-loop iterations each thread performs in the kernel.
#define DATA_ACCESSES (1 << 8)
// Threads per block: used for the launch configuration, for __launch_bounds__,
// and for computing the global thread index inside the kernel.
#define BLOCK_SIZE 128
// Number of blocks in the launch grid.
#define BLOCKS_COUNT 1024
// Read-only memory access benchmark kernel.
//
// Template parameters:
//   COUNT     - number of independent loads each thread issues per outer
//               iteration; each load has its own XOR accumulator.
//   PAGE_SIZE - number of consecutive global threads that form one group and
//               read PAGE_SIZE adjacent elements of type T (one "page").
//   T         - element type; the XOR loop consumes
//               sizeof(T) / sizeof(unsigned int) 32-bit words of each element.
//
// Preconditions: 1D launch with BLOCK_SIZE threads per block (the global
// thread index is computed from BLOCK_SIZE, not blockDim.x), and `data` must
// point to at least DATA_SIZE bytes of device memory.
template<int COUNT, int PAGE_SIZE, typename T>
__launch_bounds__(BLOCK_SIZE, 3)
__global__ void kernel(T *data)
{
    // Number of PAGE_SIZE-element pages that fit in the DATA_SIZE-byte buffer.
    const unsigned int pages_count = DATA_SIZE / (PAGE_SIZE * sizeof(T));
    // 32-bit words per element, folded into the accumulator one by one.
    const unsigned int words_per_elem = sizeof(T) / sizeof(unsigned int);

    const unsigned int global_thread = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    const unsigned int bid = global_thread / PAGE_SIZE;  // group (page reader) index
    const unsigned int tid = global_thread % PAGE_SIZE;  // lane within the page

    unsigned int dummy[COUNT];
    for (int c = 0; c < COUNT; c++)
    {
        dummy[c] = 0;
    }

    for (unsigned int i = 0; i < DATA_ACCESSES; i++)
    {
#pragma unroll
        for (unsigned int c = 0; c < COUNT; c++)
        {
            // Pseudo-random page selection. The product exceeds 2^31 for
            // realistic grids, so it is evaluated in unsigned arithmetic where
            // wrap-around is well defined (signed int overflow is undefined
            // behaviour).
            const unsigned int page = ((bid * COUNT + c) * DATA_ACCESSES + i) * 1031u;
            const unsigned int index = (page % pages_count) * PAGE_SIZE + tid;
            T v = data[index];
            const unsigned int *va = reinterpret_cast<const unsigned int *>(&v);
#pragma unroll
            for (unsigned int j = 0; j < words_per_elem; j++)
            {
                dummy[c] ^= va[j];
            }
        }
    }

    // Only a group with index 2^24 reaches the store below.
    // NOTE(review): presumably this exists so the compiler cannot discard the
    // loads as dead code while no thread of the benchmark launch actually
    // writes - confirm against the intended grid sizes.
    if (bid != (1u << 24)) return;
    for (unsigned int c = 0; c < COUNT; c++)
    {
        reinterpret_cast<unsigned int *>(data)[COUNT * c + tid] = dummy[c];
    }
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Allocates the DATA_SIZE-byte device buffer, runs the kernel once as a
// warm-up, then times a second run with CUDA events and prints the elapsed
// time and the derived read throughput.
// Returns 0 on success and 1 if any CUDA call fails (resources are then
// reclaimed by process exit).
int main()
{
    uint4 *data = nullptr;
    cudaEvent_t start, stop;
    const int count = 32;
    const int page_size = 8;

    if (!cudaSucceeded(cudaEventCreate(&start), "cudaEventCreate(start)")) return 1;
    if (!cudaSucceeded(cudaEventCreate(&stop), "cudaEventCreate(stop)")) return 1;
    if (!cudaSucceeded(cudaMalloc(reinterpret_cast<void**>(&data), DATA_SIZE), "cudaMalloc")) return 1;
    // Give the buffer defined contents so the kernel does not read
    // uninitialised device memory.
    if (!cudaSucceeded(cudaMemset(data, 0, DATA_SIZE), "cudaMemset")) return 1;

    // Warm-up launch, excluded from the measurement.
    kernel<count, page_size> << <BLOCKS_COUNT, BLOCK_SIZE >> >(data);
    // Launch-configuration errors are only visible through cudaGetLastError().
    if (!cudaSucceeded(cudaGetLastError(), "warm-up kernel launch")) return 1;
    if (!cudaSucceeded(cudaDeviceSynchronize(), "warm-up kernel execution")) return 1;

    // Timed launch, bracketed by the two events.
    if (!cudaSucceeded(cudaEventRecord(start, nullptr), "cudaEventRecord(start)")) return 1;
    kernel<count, page_size> << <BLOCKS_COUNT, BLOCK_SIZE >> >(data);
    if (!cudaSucceeded(cudaGetLastError(), "timed kernel launch")) return 1;
    if (!cudaSucceeded(cudaEventRecord(stop, nullptr), "cudaEventRecord(stop)")) return 1;
    // Wait for the kernel and the stop event before reading the elapsed time.
    if (!cudaSucceeded(cudaDeviceSynchronize(), "timed kernel execution")) return 1;

    float duration = 0.0f;
    if (!cudaSucceeded(cudaEventElapsedTime(&duration, start, stop), "cudaEventElapsedTime")) return 1;

    printf("%f ms\n", duration);
    // Bytes read = count loads * sizeof(uint4) * DATA_ACCESSES iterations per
    // thread, over BLOCKS_COUNT * BLOCK_SIZE threads; reported in GiB per second.
    printf("%f GB/s\n", (1000.0f/duration) * count * sizeof(uint4) * DATA_ACCESSES * BLOCKS_COUNT * BLOCK_SIZE / static_cast<float>(1 << 30));

    // Release device resources; failures here are reported but not fatal.
    cudaSucceeded(cudaEventDestroy(start), "cudaEventDestroy(start)");
    cudaSucceeded(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    cudaSucceeded(cudaFree(data), "cudaFree");
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment