Skip to content

Instantly share code, notes, and snippets.

@allanmac
Last active July 23, 2024 09:06
Show Gist options
  • Save allanmac/4ff11985c3562830989f to your computer and use it in GitHub Desktop.
Save allanmac/4ff11985c3562830989f to your computer and use it in GitHub Desktop.
A tiny example of CUDA + OpenGL interop with write-only surfaces and CUDA kernels. Uses GLFW+GLAD.
//
//
//
#include <stdlib.h>
#include <stdio.h>
//
//
//
#include "assert_cuda.h"
//
//
//
cudaError_t
cuda_assert(const cudaError_t code, const char* const file, const int line, const bool abort)
{
if (code != cudaSuccess)
{
fprintf(stderr,"cuda_assert: %s %s %d\n",cudaGetErrorString(code),file,line);
if (abort)
{
cudaDeviceReset();
exit(code);
}
}
return code;
}
//
//
//
//
//
//
#pragma once
//
//
//
#include <cuda_runtime.h>
#include <stdbool.h>
//
// Beware that NVCC doesn't work with C files and __VA_ARGS__
//
cudaError_t cuda_assert(const cudaError_t code, const char* const file, const int line, const bool abort);
#define cuda(...) cuda_assert((cuda##__VA_ARGS__), __FILE__, __LINE__, true);
//
//
//
//
//
//
#include <glad/glad.h>
#include <GLFW/glfw3.h>
#include <cuda_gl_interop.h>
#include <stdlib.h>
//
//
//
#include "assert_cuda.h"
#include "interop.h"
//
//
//
struct pxl_interop
{
// split GPUs?
bool multi_gpu;
// number of fbo's
int count;
int index;
// w x h
int width;
int height;
// GL buffers
GLuint* fb;
GLuint* rb;
// CUDA resources
cudaGraphicsResource_t* cgr;
cudaArray_t* ca;
};
//
//
//
struct pxl_interop*
pxl_interop_create(const bool multi_gpu, const int fbo_count)
{
struct pxl_interop* const interop = calloc(1,sizeof(*interop));
interop->multi_gpu = multi_gpu;
interop->count = fbo_count;
interop->index = 0;
// allocate arrays
interop->fb = calloc(fbo_count,sizeof(*(interop->fb )));
interop->rb = calloc(fbo_count,sizeof(*(interop->rb )));
interop->cgr = calloc(fbo_count,sizeof(*(interop->cgr)));
interop->ca = calloc(fbo_count,sizeof(*(interop->ca)));
// render buffer object w/a color buffer
glCreateRenderbuffers(fbo_count,interop->rb);
// frame buffer object
glCreateFramebuffers(fbo_count,interop->fb);
// attach rbo to fbo
for (int index=0; index<fbo_count; index++)
{
glNamedFramebufferRenderbuffer(interop->fb[index],
GL_COLOR_ATTACHMENT0,
GL_RENDERBUFFER,
interop->rb[index]);
}
// return it
return interop;
}
void
pxl_interop_destroy(struct pxl_interop* const interop)
{
cudaError_t cuda_err;
// unregister CUDA resources
for (int index=0; index<interop->count; index++)
{
if (interop->cgr[index] != NULL)
cuda_err = cuda(GraphicsUnregisterResource(interop->cgr[index]));
}
// delete rbo's
glDeleteRenderbuffers(interop->count,interop->rb);
// delete fbo's
glDeleteFramebuffers(interop->count,interop->fb);
// free buffers and resources
free(interop->fb);
free(interop->rb);
free(interop->cgr);
free(interop->ca);
// free interop
free(interop);
}
//
//
//
cudaError_t
pxl_interop_size_set(struct pxl_interop* const interop, const int width, const int height)
{
cudaError_t cuda_err = cudaSuccess;
// save new size
interop->width = width;
interop->height = height;
// resize color buffer
for (int index=0; index<interop->count; index++)
{
// unregister resource
if (interop->cgr[index] != NULL)
cuda_err = cuda(GraphicsUnregisterResource(interop->cgr[index]));
// resize rbo
glNamedRenderbufferStorage(interop->rb[index],GL_RGBA8,width,height);
// probe fbo status
// glCheckNamedFramebufferStatus(interop->fb[index],0);
// register rbo
cuda_err = cuda(GraphicsGLRegisterImage(&interop->cgr[index],
interop->rb[index],
GL_RENDERBUFFER,
cudaGraphicsRegisterFlagsSurfaceLoadStore |
cudaGraphicsRegisterFlagsWriteDiscard));
}
// map graphics resources
cuda_err = cuda(GraphicsMapResources(interop->count,interop->cgr,0));
// get CUDA Array refernces
for (int index=0; index<interop->count; index++)
{
cuda_err = cuda(GraphicsSubResourceGetMappedArray(&interop->ca[index],
interop->cgr[index],
0,0));
}
// unmap graphics resources
cuda_err = cuda(GraphicsUnmapResources(interop->count,interop->cgr,0));
return cuda_err;
}
void
pxl_interop_size_get(struct pxl_interop* const interop, int* const width, int* const height)
{
*width = interop->width;
*height = interop->height;
}
//
//
//
cudaError_t
pxl_interop_map(struct pxl_interop* const interop, cudaStream_t stream)
{
if (!interop->multi_gpu)
return cudaSuccess;
// map graphics resources
return cuda(GraphicsMapResources(1,&interop->cgr[interop->index],stream));
}
cudaError_t
pxl_interop_unmap(struct pxl_interop* const interop, cudaStream_t stream)
{
if (!interop->multi_gpu)
return cudaSuccess;
return cuda(GraphicsUnmapResources(1,&interop->cgr[interop->index],stream));
}
cudaError_t
pxl_interop_array_map(struct pxl_interop* const interop)
{
//
// FIXME -- IS THIS EVEN NEEDED?
//
cudaError_t cuda_err;
// get a CUDA Array
cuda_err = cuda(GraphicsSubResourceGetMappedArray(&interop->ca[interop->index],
interop->cgr[interop->index],
0,0));
return cuda_err;
}
//
//
//
cudaArray_const_t
pxl_interop_array_get(struct pxl_interop* const interop)
{
return interop->ca[interop->index];
}
int
pxl_interop_index_get(struct pxl_interop* const interop)
{
return interop->index;
}
//
//
//
void
pxl_interop_swap(struct pxl_interop* const interop)
{
interop->index = (interop->index + 1) % interop->count;
}
//
//
//
void
pxl_interop_clear(struct pxl_interop* const interop)
{
/*
static const GLenum attachments[] = { GL_COLOR_ATTACHMENT0 };
glInvalidateNamedFramebufferData(interop->fb[interop->index],1,attachments);
*/
const GLfloat clear_color[] = { 1.0f, 1.0f, 1.0f, 1.0f };
glClearNamedFramebufferfv(interop->fb[interop->index],GL_COLOR,0,clear_color);
}
//
//
//
void
pxl_interop_blit(struct pxl_interop* const interop)
{
glBlitNamedFramebuffer(interop->fb[interop->index],0,
0,0, interop->width,interop->height,
0,interop->height,interop->width,0,
GL_COLOR_BUFFER_BIT,
GL_NEAREST);
}
//
//
//
//
//
//
#pragma once
//
//
//
#include <cuda_runtime.h>
#include <stdbool.h>
//
//
//
struct pxl_interop*
pxl_interop_create(const bool multi_gpu, const int fbo_count);
void
pxl_interop_destroy(struct pxl_interop* const interop);
//
//
//
cudaError_t
pxl_interop_size_set(struct pxl_interop* const interop, const int width, const int height);
void
pxl_interop_size_get(struct pxl_interop* const interop, int* const width, int* const height);
//
//
//
cudaError_t
pxl_interop_map(struct pxl_interop* const interop, cudaStream_t stream);
cudaError_t
pxl_interop_unmap(struct pxl_interop* const interop, cudaStream_t stream);
cudaError_t
pxl_interop_array_map(struct pxl_interop* const interop);
//
//
//
cudaArray_const_t
pxl_interop_array_get(struct pxl_interop* const interop);
cudaStream_t
pxl_interop_stream_get(struct pxl_interop* const interop);
int
pxl_interop_index_get(struct pxl_interop* const interop);
//
//
//
void
pxl_interop_swap(struct pxl_interop* const interop);
void
pxl_interop_clear(struct pxl_interop* const interop);
void
pxl_interop_blit(struct pxl_interop* const interop);
//
//
//
// -*- compile-command: "nvcc arch sm_50 -Xptxas=-v -cubin kernel.cu"; -*-
//
//
//
#ifdef __cplusplus
extern "C" {
#endif
#include "assert_cuda.h"
#ifdef __cplusplus
}
#endif
//
//
//
#define PXL_KERNEL_THREADS_PER_BLOCK 256 // enough for 4Kx2 monitor
//
//
//
surface<void,cudaSurfaceType2D> surf;
//
//
//
union pxl_rgbx_24
{
uint1 b32;
struct {
unsigned r : 8;
unsigned g : 8;
unsigned b : 8;
unsigned na : 8;
};
};
//
//
//
extern "C"
__global__
void
pxl_kernel(const int width, const int height)
{
// pixel coordinates
const int idx = (blockDim.x * blockIdx.x) + threadIdx.x;
const int x = idx % width;
const int y = idx / width;
#if 1
// pixel color
const int t = (unsigned int)clock() / 1100000; // 1.1 GHz
const int xt = (idx + t) % width;
const unsigned int ramp = (unsigned int)(((float)xt / (float)(width-1)) * 255.0f + 0.5f);
const unsigned int bar = ((y + t) / 32) & 3;
union pxl_rgbx_24 rgbx;
rgbx.r = (bar == 0) || (bar == 1) ? ramp : 0;
rgbx.g = (bar == 0) || (bar == 2) ? ramp : 0;
rgbx.b = (bar == 0) || (bar == 3) ? ramp : 0;
rgbx.na = 255;
#else // DRAW A RED BORDER TO VALIDATE FLIPPED BLIT
const bool border = (x == 0) || (x == width-1) || (y == 0) || (y == height-1);
union pxl_rgbx_24 rgbx = { border ? 0xFF0000FF : 0xFF000000 };
#endif
surf2Dwrite(rgbx.b32, // even simpler: (unsigned int)clock()
surf,
x*sizeof(rgbx),
y,
cudaBoundaryModeZero); // squelches out-of-bound writes
}
//
//
//
extern "C"
cudaError_t
pxl_kernel_launcher(cudaArray_const_t array,
const int width,
const int height,
cudaEvent_t event,
cudaStream_t stream)
{
cudaError_t cuda_err;
// cuda_err = cudaEventRecord(event,stream);
cuda_err = cuda(BindSurfaceToArray(surf,array));
if (cuda_err)
return cuda_err;
const int blocks = (width * height + PXL_KERNEL_THREADS_PER_BLOCK - 1) / PXL_KERNEL_THREADS_PER_BLOCK;
// cuda_err = cudaEventRecord(event,stream);
if (blocks > 0)
pxl_kernel<<<blocks,PXL_KERNEL_THREADS_PER_BLOCK,0,stream>>>(width,height);
// cuda_err = cudaStreamWaitEvent(stream,event,0);
return cudaSuccess;
}
//
//
//
//
//
//
#include <glad/glad.h>
#include <GLFW/glfw3.h>
//
//
//
#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>
//
//
//
#include <cuda_gl_interop.h>
//
//
//
#include "assert_cuda.h"
#include "interop.h"
//
// FPS COUNTER FROM HERE:
//
// http://antongerdelan.net/opengl/glcontext2.html
//
static
void
pxl_glfw_fps(GLFWwindow* window)
{
// static fps counters
static double stamp_prev = 0.0;
static int frame_count = 0;
// locals
const double stamp_curr = glfwGetTime();
const double elapsed = stamp_curr - stamp_prev;
if (elapsed > 0.5)
{
stamp_prev = stamp_curr;
const double fps = (double)frame_count / elapsed;
int width, height;
char tmp[64];
glfwGetFramebufferSize(window,&width,&height);
sprintf_s(tmp,64,"(%u x %u) - FPS: %.2f",width,height,fps);
glfwSetWindowTitle(window,tmp);
frame_count = 0;
}
frame_count++;
}
//
//
//
static
void
pxl_glfw_error_callback(int error, const char* description)
{
fputs(description,stderr);
}
static
void
pxl_glfw_key_callback(GLFWwindow* window, int key, int scancode, int action, int mods)
{
if (key == GLFW_KEY_ESCAPE && action == GLFW_PRESS)
glfwSetWindowShouldClose(window, GL_TRUE);
}
static
void
pxl_glfw_init(GLFWwindow** window, const int width, const int height)
{
//
// INITIALIZE GLFW/GLAD
//
glfwSetErrorCallback(pxl_glfw_error_callback);
if (!glfwInit())
exit(EXIT_FAILURE);
glfwWindowHint(GLFW_DEPTH_BITS, 0);
glfwWindowHint(GLFW_STENCIL_BITS, 0);
glfwWindowHint(GLFW_SRGB_CAPABLE, GL_TRUE);
glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 4);
glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 5);
glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
#ifdef PXL_FULLSCREEN
GLFWmonitor* monitor = glfwGetPrimaryMonitor();
const GLFWvidmode* mode = glfwGetVideoMode(monitor);
*window = glfwCreateWindow(mode->width,mode->height,"GLFW / CUDA Interop",monitor,NULL);
#else
*window = glfwCreateWindow(width,height,"GLFW / CUDA Interop",NULL,NULL);
#endif
if (*window == NULL)
{
glfwTerminate();
exit(EXIT_FAILURE);
}
glfwMakeContextCurrent(*window);
// set up GLAD
gladLoadGLLoader((GLADloadproc)glfwGetProcAddress);
// ignore vsync for now
glfwSwapInterval(0);
// only copy r/g/b
glColorMask(GL_TRUE,GL_TRUE,GL_TRUE,GL_FALSE);
// enable SRGB
// glEnable(GL_FRAMEBUFFER_SRGB);
}
//
//
//
static
void
pxl_glfw_window_size_callback(GLFWwindow* window, int width, int height)
{
// get context
struct pxl_interop* const interop = glfwGetWindowUserPointer(window);
pxl_interop_size_set(interop,width,height);
}
//
//
//
cudaError_t
pxl_kernel_launcher(cudaArray_const_t array,
const int width,
const int height,
cudaEvent_t event,
cudaStream_t stream);
//
//
//
int
main(int argc, char* argv[])
{
//
// INIT GLFW
//
GLFWwindow* window;
pxl_glfw_init(&window,1024,1024);
//
// INIT CUDA
//
cudaError_t cuda_err;
int gl_device_id,gl_device_count;
cuda_err = cuda(GLGetDevices(&gl_device_count,&gl_device_id,1,cudaGLDeviceListAll));
int cuda_device_id = (argc > 1) ? atoi(argv[1]) : gl_device_id;
cuda_err = cuda(SetDevice(cuda_device_id));
//
// MULTI-GPU?
//
const bool multi_gpu = gl_device_id != cuda_device_id;
//
// INFO
//
struct cudaDeviceProp props;
cuda_err = cuda(GetDeviceProperties(&props,gl_device_id));
printf("GL : %-24s (%2d)\n",props.name,props.multiProcessorCount);
cuda_err = cuda(GetDeviceProperties(&props,cuda_device_id));
printf("CUDA : %-24s (%2d)\n",props.name,props.multiProcessorCount);
//
// CREATE CUDA STREAM & EVENT
//
cudaStream_t stream;
cudaEvent_t event;
cuda_err = cuda(StreamCreateWithFlags(&stream,cudaStreamDefault)); // optionally ignore default stream behavior
cuda_err = cuda(EventCreateWithFlags(&event,cudaEventBlockingSync)); // | cudaEventDisableTiming);
//
// CREATE INTEROP
//
// TESTING -- DO NOT SET TO FALSE, ONLY TRUE IS RELIABLE
struct pxl_interop* const interop = pxl_interop_create(true /*multi_gpu*/,2);
//
// RESIZE INTEROP
//
int width, height;
// get initial width/height
glfwGetFramebufferSize(window,&width,&height);
// resize with initial window dimensions
cuda_err = pxl_interop_size_set(interop,width,height);
//
// SET USER POINTER AND CALLBACKS
//
glfwSetWindowUserPointer (window,interop);
glfwSetKeyCallback (window,pxl_glfw_key_callback);
glfwSetFramebufferSizeCallback(window,pxl_glfw_window_size_callback);
//
// LOOP UNTIL DONE
//
while (!glfwWindowShouldClose(window))
{
//
// MONITOR FPS
//
pxl_glfw_fps(window);
//
// EXECUTE CUDA KERNEL ON RENDER BUFFER
//
int width,height;
cudaArray_t cuda_array;
pxl_interop_size_get(interop,&width,&height);
cuda_err = pxl_interop_map(interop,stream);
cuda_err = pxl_kernel_launcher(pxl_interop_array_get(interop),
width,
height,
event,
stream);
cuda_err = pxl_interop_unmap(interop,stream);
//
// BLIT & SWAP FBO
//
pxl_interop_blit(interop);
// pxl_interop_clear(interop);
pxl_interop_swap(interop);
//
// SWAP WINDOW
//
glfwSwapBuffers(window);
//
// PUMP/POLL/WAIT
//
glfwPollEvents(); // glfwWaitEvents();
}
//
// CLEANUP
//
pxl_interop_destroy(interop);
glfwDestroyWindow(window);
glfwTerminate();
cuda(DeviceReset());
// missing some clean up here
exit(EXIT_SUCCESS);
}
//
//
//
@allanmac
Copy link
Author

On Windows you can build with something like this:

nvcc -Xptxas=-v -o interop -I glad\output\include -I glfw\x64\include glad\output\src\glad.c main.c interop.c assert_cuda.c kernel.cu glfw\x64\lib-vc2013\glfw3dll.lib

The executable requires the glfw3.dll.

@allanmac
Copy link
Author

>interop 0
GL   : GeForce GTX 680          ( 8)
CUDA : GeForce GTX 750 Ti       ( 5)

screen capture

@munesh1407
Copy link

Hi
Just went through the code. I understand CUDA but not very proficient in OpenGL. Do you have some help files to initiate a beginner like me? Thanks.

@bipul-mohanto
Copy link

This sample code can only send buffer data from CUDA (any latest version) to OpenGL (the latest version is 4.6), right?

@allanmac
Copy link
Author

allanmac commented May 9, 2022

Yes, I think that's correct. I wrote this a very long time ago in order to understand CUDA>GL interop.

Today I would just use Vulkan.

@bipul-mohanto
Copy link

Thanks @allanmac for your reply. The same thing I am currently trying to understand. I am using OptiX (latest version 7.4) and need to send the rendered data to opengl buffer. There is no such thing called OptiX->OGL interoperability, but CUDA to OpenGL possible. I have run your code on my machine, it's working perfectly, and now I'm trying to understand what you actually did. Can you suggest some resources about CUDA->OpenGL interoperability?

@allanmac
Copy link
Author

allanmac commented May 9, 2022

If I remember correctly, there was no guide on this subject and I was mostly using the Runtime API docs and the GL interop example in the "CUDA Samples/" directory in the SDK.

It looks like the Samples/ are now on GitHub: https://github.com/NVIDIA/cuda-samples/tree/master/Samples/5_Domain_Specific/simpleGL

@bipul-mohanto
Copy link

IMHO, there has not been much changed on this topic. I think it is not something that many developers really need. Thank you very much for the resource.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment