Last active
September 17, 2024 18:34
-
-
Save allanmac/4ff11985c3562830989f to your computer and use it in GitHub Desktop.
A tiny example of CUDA + OpenGL interop with write-only surfaces and CUDA kernels. Uses GLFW+GLAD.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// | |
// | |
// | |
#include <stdlib.h> | |
#include <stdio.h> | |
// | |
// | |
// | |
#include "assert_cuda.h" | |
// | |
// | |
// | |
cudaError_t | |
cuda_assert(const cudaError_t code, const char* const file, const int line, const bool abort) | |
{ | |
if (code != cudaSuccess) | |
{ | |
fprintf(stderr,"cuda_assert: %s %s %d\n",cudaGetErrorString(code),file,line); | |
if (abort) | |
{ | |
cudaDeviceReset(); | |
exit(code); | |
} | |
} | |
return code; | |
} | |
// | |
// | |
// |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// | |
// | |
// | |
#pragma once | |
// | |
// | |
// | |
#include <cuda_runtime.h> | |
#include <stdbool.h> | |
// | |
// Beware that NVCC doesn't work with C files and __VA_ARGS__ | |
// | |
cudaError_t cuda_assert(const cudaError_t code, const char* const file, const int line, const bool abort); | |
#define cuda(...) cuda_assert((cuda##__VA_ARGS__), __FILE__, __LINE__, true); | |
// | |
// | |
// |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// | |
// | |
// | |
#include <glad/glad.h> | |
#include <GLFW/glfw3.h> | |
#include <cuda_gl_interop.h> | |
#include <stdlib.h> | |
// | |
// | |
// | |
#include "assert_cuda.h" | |
#include "interop.h" | |
// | |
// | |
// | |
struct pxl_interop | |
{ | |
// split GPUs? | |
bool multi_gpu; | |
// number of fbo's | |
int count; | |
int index; | |
// w x h | |
int width; | |
int height; | |
// GL buffers | |
GLuint* fb; | |
GLuint* rb; | |
// CUDA resources | |
cudaGraphicsResource_t* cgr; | |
cudaArray_t* ca; | |
}; | |
// | |
// | |
// | |
struct pxl_interop* | |
pxl_interop_create(const bool multi_gpu, const int fbo_count) | |
{ | |
struct pxl_interop* const interop = calloc(1,sizeof(*interop)); | |
interop->multi_gpu = multi_gpu; | |
interop->count = fbo_count; | |
interop->index = 0; | |
// allocate arrays | |
interop->fb = calloc(fbo_count,sizeof(*(interop->fb ))); | |
interop->rb = calloc(fbo_count,sizeof(*(interop->rb ))); | |
interop->cgr = calloc(fbo_count,sizeof(*(interop->cgr))); | |
interop->ca = calloc(fbo_count,sizeof(*(interop->ca))); | |
// render buffer object w/a color buffer | |
glCreateRenderbuffers(fbo_count,interop->rb); | |
// frame buffer object | |
glCreateFramebuffers(fbo_count,interop->fb); | |
// attach rbo to fbo | |
for (int index=0; index<fbo_count; index++) | |
{ | |
glNamedFramebufferRenderbuffer(interop->fb[index], | |
GL_COLOR_ATTACHMENT0, | |
GL_RENDERBUFFER, | |
interop->rb[index]); | |
} | |
// return it | |
return interop; | |
} | |
void | |
pxl_interop_destroy(struct pxl_interop* const interop) | |
{ | |
cudaError_t cuda_err; | |
// unregister CUDA resources | |
for (int index=0; index<interop->count; index++) | |
{ | |
if (interop->cgr[index] != NULL) | |
cuda_err = cuda(GraphicsUnregisterResource(interop->cgr[index])); | |
} | |
// delete rbo's | |
glDeleteRenderbuffers(interop->count,interop->rb); | |
// delete fbo's | |
glDeleteFramebuffers(interop->count,interop->fb); | |
// free buffers and resources | |
free(interop->fb); | |
free(interop->rb); | |
free(interop->cgr); | |
free(interop->ca); | |
// free interop | |
free(interop); | |
} | |
// | |
// | |
// | |
cudaError_t | |
pxl_interop_size_set(struct pxl_interop* const interop, const int width, const int height) | |
{ | |
cudaError_t cuda_err = cudaSuccess; | |
// save new size | |
interop->width = width; | |
interop->height = height; | |
// resize color buffer | |
for (int index=0; index<interop->count; index++) | |
{ | |
// unregister resource | |
if (interop->cgr[index] != NULL) | |
cuda_err = cuda(GraphicsUnregisterResource(interop->cgr[index])); | |
// resize rbo | |
glNamedRenderbufferStorage(interop->rb[index],GL_RGBA8,width,height); | |
// probe fbo status | |
// glCheckNamedFramebufferStatus(interop->fb[index],0); | |
// register rbo | |
cuda_err = cuda(GraphicsGLRegisterImage(&interop->cgr[index], | |
interop->rb[index], | |
GL_RENDERBUFFER, | |
cudaGraphicsRegisterFlagsSurfaceLoadStore | | |
cudaGraphicsRegisterFlagsWriteDiscard)); | |
} | |
// map graphics resources | |
cuda_err = cuda(GraphicsMapResources(interop->count,interop->cgr,0)); | |
// get CUDA Array refernces | |
for (int index=0; index<interop->count; index++) | |
{ | |
cuda_err = cuda(GraphicsSubResourceGetMappedArray(&interop->ca[index], | |
interop->cgr[index], | |
0,0)); | |
} | |
// unmap graphics resources | |
cuda_err = cuda(GraphicsUnmapResources(interop->count,interop->cgr,0)); | |
return cuda_err; | |
} | |
void | |
pxl_interop_size_get(struct pxl_interop* const interop, int* const width, int* const height) | |
{ | |
*width = interop->width; | |
*height = interop->height; | |
} | |
// | |
// | |
// | |
cudaError_t | |
pxl_interop_map(struct pxl_interop* const interop, cudaStream_t stream) | |
{ | |
if (!interop->multi_gpu) | |
return cudaSuccess; | |
// map graphics resources | |
return cuda(GraphicsMapResources(1,&interop->cgr[interop->index],stream)); | |
} | |
cudaError_t | |
pxl_interop_unmap(struct pxl_interop* const interop, cudaStream_t stream) | |
{ | |
if (!interop->multi_gpu) | |
return cudaSuccess; | |
return cuda(GraphicsUnmapResources(1,&interop->cgr[interop->index],stream)); | |
} | |
cudaError_t | |
pxl_interop_array_map(struct pxl_interop* const interop) | |
{ | |
// | |
// FIXME -- IS THIS EVEN NEEDED? | |
// | |
cudaError_t cuda_err; | |
// get a CUDA Array | |
cuda_err = cuda(GraphicsSubResourceGetMappedArray(&interop->ca[interop->index], | |
interop->cgr[interop->index], | |
0,0)); | |
return cuda_err; | |
} | |
// | |
// | |
// | |
cudaArray_const_t | |
pxl_interop_array_get(struct pxl_interop* const interop) | |
{ | |
return interop->ca[interop->index]; | |
} | |
int | |
pxl_interop_index_get(struct pxl_interop* const interop) | |
{ | |
return interop->index; | |
} | |
// | |
// | |
// | |
void | |
pxl_interop_swap(struct pxl_interop* const interop) | |
{ | |
interop->index = (interop->index + 1) % interop->count; | |
} | |
// | |
// | |
// | |
void | |
pxl_interop_clear(struct pxl_interop* const interop) | |
{ | |
/* | |
static const GLenum attachments[] = { GL_COLOR_ATTACHMENT0 }; | |
glInvalidateNamedFramebufferData(interop->fb[interop->index],1,attachments); | |
*/ | |
const GLfloat clear_color[] = { 1.0f, 1.0f, 1.0f, 1.0f }; | |
glClearNamedFramebufferfv(interop->fb[interop->index],GL_COLOR,0,clear_color); | |
} | |
// | |
// | |
// | |
void | |
pxl_interop_blit(struct pxl_interop* const interop) | |
{ | |
glBlitNamedFramebuffer(interop->fb[interop->index],0, | |
0,0, interop->width,interop->height, | |
0,interop->height,interop->width,0, | |
GL_COLOR_BUFFER_BIT, | |
GL_NEAREST); | |
} | |
// | |
// | |
// |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// | |
// | |
// | |
#pragma once | |
// | |
// | |
// | |
#include <cuda_runtime.h> | |
#include <stdbool.h> | |
// | |
// | |
// | |
struct pxl_interop* | |
pxl_interop_create(const bool multi_gpu, const int fbo_count); | |
void | |
pxl_interop_destroy(struct pxl_interop* const interop); | |
// | |
// | |
// | |
cudaError_t | |
pxl_interop_size_set(struct pxl_interop* const interop, const int width, const int height); | |
void | |
pxl_interop_size_get(struct pxl_interop* const interop, int* const width, int* const height); | |
// | |
// | |
// | |
cudaError_t | |
pxl_interop_map(struct pxl_interop* const interop, cudaStream_t stream); | |
cudaError_t | |
pxl_interop_unmap(struct pxl_interop* const interop, cudaStream_t stream); | |
cudaError_t | |
pxl_interop_array_map(struct pxl_interop* const interop); | |
// | |
// | |
// | |
cudaArray_const_t | |
pxl_interop_array_get(struct pxl_interop* const interop); | |
cudaStream_t | |
pxl_interop_stream_get(struct pxl_interop* const interop); | |
int | |
pxl_interop_index_get(struct pxl_interop* const interop); | |
// | |
// | |
// | |
void | |
pxl_interop_swap(struct pxl_interop* const interop); | |
void | |
pxl_interop_clear(struct pxl_interop* const interop); | |
void | |
pxl_interop_blit(struct pxl_interop* const interop); | |
// | |
// | |
// |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// -*- compile-command: "nvcc arch sm_50 -Xptxas=-v -cubin kernel.cu"; -*- | |
// | |
// | |
// | |
#ifdef __cplusplus | |
extern "C" { | |
#endif | |
#include "assert_cuda.h" | |
#ifdef __cplusplus | |
} | |
#endif | |
// | |
// | |
// | |
#define PXL_KERNEL_THREADS_PER_BLOCK 256 // enough for 4Kx2 monitor | |
// | |
// | |
// | |
surface<void,cudaSurfaceType2D> surf; | |
// | |
// | |
// | |
union pxl_rgbx_24 | |
{ | |
uint1 b32; | |
struct { | |
unsigned r : 8; | |
unsigned g : 8; | |
unsigned b : 8; | |
unsigned na : 8; | |
}; | |
}; | |
// | |
// | |
// | |
extern "C" | |
__global__ | |
void | |
pxl_kernel(const int width, const int height) | |
{ | |
// pixel coordinates | |
const int idx = (blockDim.x * blockIdx.x) + threadIdx.x; | |
const int x = idx % width; | |
const int y = idx / width; | |
#if 1 | |
// pixel color | |
const int t = (unsigned int)clock() / 1100000; // 1.1 GHz | |
const int xt = (idx + t) % width; | |
const unsigned int ramp = (unsigned int)(((float)xt / (float)(width-1)) * 255.0f + 0.5f); | |
const unsigned int bar = ((y + t) / 32) & 3; | |
union pxl_rgbx_24 rgbx; | |
rgbx.r = (bar == 0) || (bar == 1) ? ramp : 0; | |
rgbx.g = (bar == 0) || (bar == 2) ? ramp : 0; | |
rgbx.b = (bar == 0) || (bar == 3) ? ramp : 0; | |
rgbx.na = 255; | |
#else // DRAW A RED BORDER TO VALIDATE FLIPPED BLIT | |
const bool border = (x == 0) || (x == width-1) || (y == 0) || (y == height-1); | |
union pxl_rgbx_24 rgbx = { border ? 0xFF0000FF : 0xFF000000 }; | |
#endif | |
surf2Dwrite(rgbx.b32, // even simpler: (unsigned int)clock() | |
surf, | |
x*sizeof(rgbx), | |
y, | |
cudaBoundaryModeZero); // squelches out-of-bound writes | |
} | |
// | |
// | |
// | |
extern "C" | |
cudaError_t | |
pxl_kernel_launcher(cudaArray_const_t array, | |
const int width, | |
const int height, | |
cudaEvent_t event, | |
cudaStream_t stream) | |
{ | |
cudaError_t cuda_err; | |
// cuda_err = cudaEventRecord(event,stream); | |
cuda_err = cuda(BindSurfaceToArray(surf,array)); | |
if (cuda_err) | |
return cuda_err; | |
const int blocks = (width * height + PXL_KERNEL_THREADS_PER_BLOCK - 1) / PXL_KERNEL_THREADS_PER_BLOCK; | |
// cuda_err = cudaEventRecord(event,stream); | |
if (blocks > 0) | |
pxl_kernel<<<blocks,PXL_KERNEL_THREADS_PER_BLOCK,0,stream>>>(width,height); | |
// cuda_err = cudaStreamWaitEvent(stream,event,0); | |
return cudaSuccess; | |
} | |
// | |
// | |
// |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// | |
// | |
// | |
#include <glad/glad.h> | |
#include <GLFW/glfw3.h> | |
// | |
// | |
// | |
#include <stdlib.h> | |
#include <stdio.h> | |
#include <stdbool.h> | |
// | |
// | |
// | |
#include <cuda_gl_interop.h> | |
// | |
// | |
// | |
#include "assert_cuda.h" | |
#include "interop.h" | |
// | |
// FPS COUNTER FROM HERE: | |
// | |
// http://antongerdelan.net/opengl/glcontext2.html | |
// | |
static | |
void | |
pxl_glfw_fps(GLFWwindow* window) | |
{ | |
// static fps counters | |
static double stamp_prev = 0.0; | |
static int frame_count = 0; | |
// locals | |
const double stamp_curr = glfwGetTime(); | |
const double elapsed = stamp_curr - stamp_prev; | |
if (elapsed > 0.5) | |
{ | |
stamp_prev = stamp_curr; | |
const double fps = (double)frame_count / elapsed; | |
int width, height; | |
char tmp[64]; | |
glfwGetFramebufferSize(window,&width,&height); | |
sprintf_s(tmp,64,"(%u x %u) - FPS: %.2f",width,height,fps); | |
glfwSetWindowTitle(window,tmp); | |
frame_count = 0; | |
} | |
frame_count++; | |
} | |
// | |
// | |
// | |
static | |
void | |
pxl_glfw_error_callback(int error, const char* description) | |
{ | |
fputs(description,stderr); | |
} | |
static | |
void | |
pxl_glfw_key_callback(GLFWwindow* window, int key, int scancode, int action, int mods) | |
{ | |
if (key == GLFW_KEY_ESCAPE && action == GLFW_PRESS) | |
glfwSetWindowShouldClose(window, GL_TRUE); | |
} | |
static | |
void | |
pxl_glfw_init(GLFWwindow** window, const int width, const int height) | |
{ | |
// | |
// INITIALIZE GLFW/GLAD | |
// | |
glfwSetErrorCallback(pxl_glfw_error_callback); | |
if (!glfwInit()) | |
exit(EXIT_FAILURE); | |
glfwWindowHint(GLFW_DEPTH_BITS, 0); | |
glfwWindowHint(GLFW_STENCIL_BITS, 0); | |
glfwWindowHint(GLFW_SRGB_CAPABLE, GL_TRUE); | |
glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 4); | |
glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 5); | |
glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE); | |
#ifdef PXL_FULLSCREEN | |
GLFWmonitor* monitor = glfwGetPrimaryMonitor(); | |
const GLFWvidmode* mode = glfwGetVideoMode(monitor); | |
*window = glfwCreateWindow(mode->width,mode->height,"GLFW / CUDA Interop",monitor,NULL); | |
#else | |
*window = glfwCreateWindow(width,height,"GLFW / CUDA Interop",NULL,NULL); | |
#endif | |
if (*window == NULL) | |
{ | |
glfwTerminate(); | |
exit(EXIT_FAILURE); | |
} | |
glfwMakeContextCurrent(*window); | |
// set up GLAD | |
gladLoadGLLoader((GLADloadproc)glfwGetProcAddress); | |
// ignore vsync for now | |
glfwSwapInterval(0); | |
// only copy r/g/b | |
glColorMask(GL_TRUE,GL_TRUE,GL_TRUE,GL_FALSE); | |
// enable SRGB | |
// glEnable(GL_FRAMEBUFFER_SRGB); | |
} | |
// | |
// | |
// | |
static | |
void | |
pxl_glfw_window_size_callback(GLFWwindow* window, int width, int height) | |
{ | |
// get context | |
struct pxl_interop* const interop = glfwGetWindowUserPointer(window); | |
pxl_interop_size_set(interop,width,height); | |
} | |
// | |
// | |
// | |
cudaError_t | |
pxl_kernel_launcher(cudaArray_const_t array, | |
const int width, | |
const int height, | |
cudaEvent_t event, | |
cudaStream_t stream); | |
// | |
// | |
// | |
int | |
main(int argc, char* argv[]) | |
{ | |
// | |
// INIT GLFW | |
// | |
GLFWwindow* window; | |
pxl_glfw_init(&window,1024,1024); | |
// | |
// INIT CUDA | |
// | |
cudaError_t cuda_err; | |
int gl_device_id,gl_device_count; | |
cuda_err = cuda(GLGetDevices(&gl_device_count,&gl_device_id,1,cudaGLDeviceListAll)); | |
int cuda_device_id = (argc > 1) ? atoi(argv[1]) : gl_device_id; | |
cuda_err = cuda(SetDevice(cuda_device_id)); | |
// | |
// MULTI-GPU? | |
// | |
const bool multi_gpu = gl_device_id != cuda_device_id; | |
// | |
// INFO | |
// | |
struct cudaDeviceProp props; | |
cuda_err = cuda(GetDeviceProperties(&props,gl_device_id)); | |
printf("GL : %-24s (%2d)\n",props.name,props.multiProcessorCount); | |
cuda_err = cuda(GetDeviceProperties(&props,cuda_device_id)); | |
printf("CUDA : %-24s (%2d)\n",props.name,props.multiProcessorCount); | |
// | |
// CREATE CUDA STREAM & EVENT | |
// | |
cudaStream_t stream; | |
cudaEvent_t event; | |
cuda_err = cuda(StreamCreateWithFlags(&stream,cudaStreamDefault)); // optionally ignore default stream behavior | |
cuda_err = cuda(EventCreateWithFlags(&event,cudaEventBlockingSync)); // | cudaEventDisableTiming); | |
// | |
// CREATE INTEROP | |
// | |
// TESTING -- DO NOT SET TO FALSE, ONLY TRUE IS RELIABLE | |
struct pxl_interop* const interop = pxl_interop_create(true /*multi_gpu*/,2); | |
// | |
// RESIZE INTEROP | |
// | |
int width, height; | |
// get initial width/height | |
glfwGetFramebufferSize(window,&width,&height); | |
// resize with initial window dimensions | |
cuda_err = pxl_interop_size_set(interop,width,height); | |
// | |
// SET USER POINTER AND CALLBACKS | |
// | |
glfwSetWindowUserPointer (window,interop); | |
glfwSetKeyCallback (window,pxl_glfw_key_callback); | |
glfwSetFramebufferSizeCallback(window,pxl_glfw_window_size_callback); | |
// | |
// LOOP UNTIL DONE | |
// | |
while (!glfwWindowShouldClose(window)) | |
{ | |
// | |
// MONITOR FPS | |
// | |
pxl_glfw_fps(window); | |
// | |
// EXECUTE CUDA KERNEL ON RENDER BUFFER | |
// | |
int width,height; | |
cudaArray_t cuda_array; | |
pxl_interop_size_get(interop,&width,&height); | |
cuda_err = pxl_interop_map(interop,stream); | |
cuda_err = pxl_kernel_launcher(pxl_interop_array_get(interop), | |
width, | |
height, | |
event, | |
stream); | |
cuda_err = pxl_interop_unmap(interop,stream); | |
// | |
// BLIT & SWAP FBO | |
// | |
pxl_interop_blit(interop); | |
// pxl_interop_clear(interop); | |
pxl_interop_swap(interop); | |
// | |
// SWAP WINDOW | |
// | |
glfwSwapBuffers(window); | |
// | |
// PUMP/POLL/WAIT | |
// | |
glfwPollEvents(); // glfwWaitEvents(); | |
} | |
// | |
// CLEANUP | |
// | |
pxl_interop_destroy(interop); | |
glfwDestroyWindow(window); | |
glfwTerminate(); | |
cuda(DeviceReset()); | |
// missing some clean up here | |
exit(EXIT_SUCCESS); | |
} | |
// | |
// | |
// |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
IMHO, there has not been much changed on this topic. I think it is not something that many developers really need. Thank you very much for the resource.