Skip to content

Instantly share code, notes, and snippets.

@3outeille
Last active November 21, 2022 00:25
Show Gist options
  • Save 3outeille/2d437aec31bde6d95b615c8c7a65cb22 to your computer and use it in GitHub Desktop.
Save 3outeille/2d437aec31bde6d95b615c8c7a65cb22 to your computer and use it in GitHub Desktop.
CUDA experiment bank conflict shared memory (with a CMakeLists)
#include "stdio.h"
#include "utils.hh"
#include <cuda_runtime_api.h>
#define REPETITIONS 1
#define MEMORY_SIZE 2048
__global__ void kernel(int offset, bool is_debug) {
__shared__ uint32_t shared_memory[MEMORY_SIZE];
// init shared memory
if (threadIdx.x == 0) {
for (int i = 0; i < MEMORY_SIZE; i++)
shared_memory[i] = i;
}
__syncthreads();
uint32_t index = threadIdx.x * offset;
// 2048 / 32 = 64
for (int i = 0; i < 64; i++)
{
if (is_debug) {
if (threadIdx.x == 0)
{
printf("\n");
printf("Iteration %d:\n", i);
}
}
if (is_debug)
printf("\tthread %d: shared_memory[%d] = %d (addr %p) \n", threadIdx.x, index, shared_memory[index], &shared_memory[index]);
// Perform some computation to avoid compiler optimizations
shared_memory[index] += (uint32_t)(cosf(index * i));
shared_memory[index] *= index * i;
shared_memory[index] += (uint32_t)(sinf(index * i));;
shared_memory[index] *= (uint32_t)(sinf(index * i) + cosf(index));;
index += 32;
index %= MEMORY_SIZE;
__syncthreads();
if (is_debug)
printf("-");
}
}
int main(int argc, char **argv) {
int offset = 1;
bool is_debug = false;
if (argc > 2) {
offset = atoi(argv[1]);
std::string flag(argv[2]);
if (flag == "true")
is_debug = true;
}
cudaDeviceProp prop;
int device_count;
CHECK_CUDA_CALL(cudaGetDevice(&device_count));
CHECK_CUDA_CALL(cudaGetDeviceProperties(&prop, device_count));
if (is_debug) {
printf("Device name: %s\n", prop.name);
printf("Warp size: %d\n", prop.warpSize);
}
cudaSharedMemConfig shared_mem_config;
CHECK_CUDA_CALL(cudaDeviceGetSharedMemConfig(&shared_mem_config));
// Force bank size to be 4 bytes
if (shared_mem_config == cudaSharedMemBankSizeEightByte)
CHECK_CUDA_CALL(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte));
// print limit for printf buffer
size_t limit;
CHECK_CUDA_CALL(cudaDeviceGetLimit(&limit, cudaLimitPrintfFifoSize));
if (is_debug)
printf("printf buffer limit: %zu\n", limit);
// Set limit to 4MB for printf buffer to avoid printing problems
CHECK_CUDA_CALL(cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 4000000));
CHECK_CUDA_CALL(cudaDeviceGetLimit(&limit, cudaLimitPrintfFifoSize));
if (is_debug)
printf("printf buffer limit: %zu\n", limit);
printf("Offset: %d\n", offset);
printf("Debug mode: %s\n", is_debug ? "true" : "false");
dim3 blocks = dim3(1);
dim3 threads = dim3(32);
float time = 0;
for (int i = 0; i < REPETITIONS; i++)
{
CudaTimer timer;
timer.start_time();
kernel<<<blocks, threads>>>(offset, is_debug); // A warp
CHECK_CUDA_CALL(cudaDeviceSynchronize());
CHECK_CUDA_CALL(cudaPeekAtLastError());
timer.stop_time();
time += timer.elapsed_time();
}
std::cout << "Average time: " << time / REPETITIONS << " ms" << std::endl;
return 0;
}
# To run
# mkdir build && cd build
# cmake ..
# make -j && ./bank conflict <offset> <is_debug>
cmake_minimum_required(VERSION 3.0)
set(CMAKE_CXX_FLAGS "-O3 -std=c++14")
set(CUDA_NVCC_FLAGS -arch=compute_52 -code=sm_75)
find_package(CUDA REQUIRED)
include_directories(${CUDA_INCLUDE_DIRS})
CUDA_ADD_EXECUTABLE(bank_conflict bank_conflict.cu)
target_link_libraries(bank_conflict ${CUDA_LIBRARIES} ${CUDA_cublas_LIBRARY})
#pragma once
#include <iostream>
#include <cuda_runtime_api.h>
inline void checkCudaCall(cudaError_t error, const char* file, int line)
{
if (error)
{
std::cout << "CUDA error at " << file << ":" << line << std::endl;
std::cout << cudaGetErrorName(error) << " :: " << cudaGetErrorString(error) << std::endl;
}
}
#define CHECK_CUDA_CALL(err) (checkCudaCall(err, __FILE__, __LINE__))
struct CudaTimer
{
cudaEvent_t start;
cudaEvent_t stop;
CudaTimer()
{
CHECK_CUDA_CALL(cudaEventCreate(&start));
CHECK_CUDA_CALL(cudaEventCreate(&stop));
}
~CudaTimer()
{
CHECK_CUDA_CALL(cudaEventDestroy(start));
CHECK_CUDA_CALL(cudaEventDestroy(stop));
}
void start_time()
{
CHECK_CUDA_CALL(cudaEventRecord(start, 0));
CHECK_CUDA_CALL(cudaEventSynchronize(start));
}
void stop_time()
{
CHECK_CUDA_CALL(cudaEventRecord(stop, 0));
}
float elapsed_time()
{
float elapsed;
CHECK_CUDA_CALL(cudaEventSynchronize(stop));
CHECK_CUDA_CALL(cudaEventElapsedTime(&elapsed, start, stop));
return elapsed;
}
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment