Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
A tiny example of CUDA + OpenGL interop with write-only surfaces and CUDA kernels. Uses GLFW+GLAD.
//
//
//
#include <stdlib.h>
#include <stdio.h>
//
//
//
#include "assert_cuda.h"
//
//
//
cudaError_t
cuda_assert(const cudaError_t code, const char* const file, const int line, const bool abort)
{
if (code != cudaSuccess)
{
fprintf(stderr,"cuda_assert: %s %s %d\n",cudaGetErrorString(code),file,line);
if (abort)
{
cudaDeviceReset();
exit(code);
}
}
return code;
}
//
//
//
//
//
//
#pragma once
//
//
//
#include <cuda_runtime.h>
#include <stdbool.h>
//
// Beware that NVCC doesn't work with C files and __VA_ARGS__
//
cudaError_t cuda_assert(const cudaError_t code, const char* const file, const int line, const bool abort);
#define cuda(...) cuda_assert((cuda##__VA_ARGS__), __FILE__, __LINE__, true);
//
//
//
//
//
//
#include <glad/glad.h>
#include <GLFW/glfw3.h>
#include <cuda_gl_interop.h>
#include <stdlib.h>
//
//
//
#include "assert_cuda.h"
#include "interop.h"
//
//
//
struct pxl_interop
{
// split GPUs?
bool multi_gpu;
// number of fbo's
int count;
int index;
// w x h
int width;
int height;
// GL buffers
GLuint* fb;
GLuint* rb;
// CUDA resources
cudaGraphicsResource_t* cgr;
cudaArray_t* ca;
};
//
//
//
struct pxl_interop*
pxl_interop_create(const bool multi_gpu, const int fbo_count)
{
struct pxl_interop* const interop = calloc(1,sizeof(*interop));
interop->multi_gpu = multi_gpu;
interop->count = fbo_count;
interop->index = 0;
// allocate arrays
interop->fb = calloc(fbo_count,sizeof(*(interop->fb )));
interop->rb = calloc(fbo_count,sizeof(*(interop->rb )));
interop->cgr = calloc(fbo_count,sizeof(*(interop->cgr)));
interop->ca = calloc(fbo_count,sizeof(*(interop->ca)));
// render buffer object w/a color buffer
glCreateRenderbuffers(fbo_count,interop->rb);
// frame buffer object
glCreateFramebuffers(fbo_count,interop->fb);
// attach rbo to fbo
for (int index=0; index<fbo_count; index++)
{
glNamedFramebufferRenderbuffer(interop->fb[index],
GL_COLOR_ATTACHMENT0,
GL_RENDERBUFFER,
interop->rb[index]);
}
// return it
return interop;
}
void
pxl_interop_destroy(struct pxl_interop* const interop)
{
cudaError_t cuda_err;
// unregister CUDA resources
for (int index=0; index<interop->count; index++)
{
if (interop->cgr[index] != NULL)
cuda_err = cuda(GraphicsUnregisterResource(interop->cgr[index]));
}
// delete rbo's
glDeleteRenderbuffers(interop->count,interop->rb);
// delete fbo's
glDeleteFramebuffers(interop->count,interop->fb);
// free buffers and resources
free(interop->fb);
free(interop->rb);
free(interop->cgr);
free(interop->ca);
// free interop
free(interop);
}
//
//
//
cudaError_t
pxl_interop_size_set(struct pxl_interop* const interop, const int width, const int height)
{
cudaError_t cuda_err = cudaSuccess;
// save new size
interop->width = width;
interop->height = height;
// resize color buffer
for (int index=0; index<interop->count; index++)
{
// unregister resource
if (interop->cgr[index] != NULL)
cuda_err = cuda(GraphicsUnregisterResource(interop->cgr[index]));
// resize rbo
glNamedRenderbufferStorage(interop->rb[index],GL_RGBA8,width,height);
// probe fbo status
// glCheckNamedFramebufferStatus(interop->fb[index],0);
// register rbo
cuda_err = cuda(GraphicsGLRegisterImage(&interop->cgr[index],
interop->rb[index],
GL_RENDERBUFFER,
cudaGraphicsRegisterFlagsSurfaceLoadStore |
cudaGraphicsRegisterFlagsWriteDiscard));
}
// map graphics resources
cuda_err = cuda(GraphicsMapResources(interop->count,interop->cgr,0));
// get CUDA Array refernces
for (int index=0; index<interop->count; index++)
{
cuda_err = cuda(GraphicsSubResourceGetMappedArray(&interop->ca[index],
interop->cgr[index],
0,0));
}
// unmap graphics resources
cuda_err = cuda(GraphicsUnmapResources(interop->count,interop->cgr,0));
return cuda_err;
}
void
pxl_interop_size_get(struct pxl_interop* const interop, int* const width, int* const height)
{
*width = interop->width;
*height = interop->height;
}
//
//
//
cudaError_t
pxl_interop_map(struct pxl_interop* const interop, cudaStream_t stream)
{
if (!interop->multi_gpu)
return cudaSuccess;
// map graphics resources
return cuda(GraphicsMapResources(1,&interop->cgr[interop->index],stream));
}
cudaError_t
pxl_interop_unmap(struct pxl_interop* const interop, cudaStream_t stream)
{
if (!interop->multi_gpu)
return cudaSuccess;
return cuda(GraphicsUnmapResources(1,&interop->cgr[interop->index],stream));
}
cudaError_t
pxl_interop_array_map(struct pxl_interop* const interop)
{
//
// FIXME -- IS THIS EVEN NEEDED?
//
cudaError_t cuda_err;
// get a CUDA Array
cuda_err = cuda(GraphicsSubResourceGetMappedArray(&interop->ca[interop->index],
interop->cgr[interop->index],
0,0));
return cuda_err;
}
//
//
//
cudaArray_const_t
pxl_interop_array_get(struct pxl_interop* const interop)
{
return interop->ca[interop->index];
}
int
pxl_interop_index_get(struct pxl_interop* const interop)
{
return interop->index;
}
//
//
//
void
pxl_interop_swap(struct pxl_interop* const interop)
{
interop->index = (interop->index + 1) % interop->count;
}
//
//
//
void
pxl_interop_clear(struct pxl_interop* const interop)
{
/*
static const GLenum attachments[] = { GL_COLOR_ATTACHMENT0 };
glInvalidateNamedFramebufferData(interop->fb[interop->index],1,attachments);
*/
const GLfloat clear_color[] = { 1.0f, 1.0f, 1.0f, 1.0f };
glClearNamedFramebufferfv(interop->fb[interop->index],GL_COLOR,0,clear_color);
}
//
//
//
void
pxl_interop_blit(struct pxl_interop* const interop)
{
glBlitNamedFramebuffer(interop->fb[interop->index],0,
0,0, interop->width,interop->height,
0,interop->height,interop->width,0,
GL_COLOR_BUFFER_BIT,
GL_NEAREST);
}
//
//
//
//
//
//
#pragma once
//
//
//
#include <cuda_runtime.h>
#include <stdbool.h>
//
//
//
struct pxl_interop*
pxl_interop_create(const bool multi_gpu, const int fbo_count);
void
pxl_interop_destroy(struct pxl_interop* const interop);
//
//
//
cudaError_t
pxl_interop_size_set(struct pxl_interop* const interop, const int width, const int height);
void
pxl_interop_size_get(struct pxl_interop* const interop, int* const width, int* const height);
//
//
//
cudaError_t
pxl_interop_map(struct pxl_interop* const interop, cudaStream_t stream);
cudaError_t
pxl_interop_unmap(struct pxl_interop* const interop, cudaStream_t stream);
cudaError_t
pxl_interop_array_map(struct pxl_interop* const interop);
//
//
//
cudaArray_const_t
pxl_interop_array_get(struct pxl_interop* const interop);
cudaStream_t
pxl_interop_stream_get(struct pxl_interop* const interop);
int
pxl_interop_index_get(struct pxl_interop* const interop);
//
//
//
void
pxl_interop_swap(struct pxl_interop* const interop);
void
pxl_interop_clear(struct pxl_interop* const interop);
void
pxl_interop_blit(struct pxl_interop* const interop);
//
//
//
// -*- compile-command: "nvcc arch sm_50 -Xptxas=-v -cubin kernel.cu"; -*-
//
//
//
#ifdef __cplusplus
extern "C" {
#endif
#include "assert_cuda.h"
#ifdef __cplusplus
}
#endif
//
//
//
#define PXL_KERNEL_THREADS_PER_BLOCK 256 // enough for 4Kx2 monitor
//
//
//
surface<void,cudaSurfaceType2D> surf;
//
//
//
union pxl_rgbx_24
{
uint1 b32;
struct {
unsigned r : 8;
unsigned g : 8;
unsigned b : 8;
unsigned na : 8;
};
};
//
//
//
extern "C"
__global__
void
pxl_kernel(const int width, const int height)
{
// pixel coordinates
const int idx = (blockDim.x * blockIdx.x) + threadIdx.x;
const int x = idx % width;
const int y = idx / width;
#if 1
// pixel color
const int t = (unsigned int)clock() / 1100000; // 1.1 GHz
const int xt = (idx + t) % width;
const unsigned int ramp = (unsigned int)(((float)xt / (float)(width-1)) * 255.0f + 0.5f);
const unsigned int bar = ((y + t) / 32) & 3;
union pxl_rgbx_24 rgbx;
rgbx.r = (bar == 0) || (bar == 1) ? ramp : 0;
rgbx.g = (bar == 0) || (bar == 2) ? ramp : 0;
rgbx.b = (bar == 0) || (bar == 3) ? ramp : 0;
rgbx.na = 255;
#else // DRAW A RED BORDER TO VALIDATE FLIPPED BLIT
const bool border = (x == 0) || (x == width-1) || (y == 0) || (y == height-1);
union pxl_rgbx_24 rgbx = { border ? 0xFF0000FF : 0xFF000000 };
#endif
surf2Dwrite(rgbx.b32, // even simpler: (unsigned int)clock()
surf,
x*sizeof(rgbx),
y,
cudaBoundaryModeZero); // squelches out-of-bound writes
}
//
//
//
extern "C"
cudaError_t
pxl_kernel_launcher(cudaArray_const_t array,
const int width,
const int height,
cudaEvent_t event,
cudaStream_t stream)
{
cudaError_t cuda_err;
// cuda_err = cudaEventRecord(event,stream);
cuda_err = cuda(BindSurfaceToArray(surf,array));
if (cuda_err)
return cuda_err;
const int blocks = (width * height + PXL_KERNEL_THREADS_PER_BLOCK - 1) / PXL_KERNEL_THREADS_PER_BLOCK;
// cuda_err = cudaEventRecord(event,stream);
if (blocks > 0)
pxl_kernel<<<blocks,PXL_KERNEL_THREADS_PER_BLOCK,0,stream>>>(width,height);
// cuda_err = cudaStreamWaitEvent(stream,event,0);
return cudaSuccess;
}
//
//
//
//
//
//
#include <glad/glad.h>
#include <GLFW/glfw3.h>
//
//
//
#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>
//
//
//
#include <cuda_gl_interop.h>
//
//
//
#include "assert_cuda.h"
#include "interop.h"
//
// FPS COUNTER FROM HERE:
//
// http://antongerdelan.net/opengl/glcontext2.html
//
static
void
pxl_glfw_fps(GLFWwindow* window)
{
// static fps counters
static double stamp_prev = 0.0;
static int frame_count = 0;
// locals
const double stamp_curr = glfwGetTime();
const double elapsed = stamp_curr - stamp_prev;
if (elapsed > 0.5)
{
stamp_prev = stamp_curr;
const double fps = (double)frame_count / elapsed;
int width, height;
char tmp[64];
glfwGetFramebufferSize(window,&width,&height);
sprintf_s(tmp,64,"(%u x %u) - FPS: %.2f",width,height,fps);
glfwSetWindowTitle(window,tmp);
frame_count = 0;
}
frame_count++;
}
//
//
//
static
void
pxl_glfw_error_callback(int error, const char* description)
{
fputs(description,stderr);
}
static
void
pxl_glfw_key_callback(GLFWwindow* window, int key, int scancode, int action, int mods)
{
if (key == GLFW_KEY_ESCAPE && action == GLFW_PRESS)
glfwSetWindowShouldClose(window, GL_TRUE);
}
static
void
pxl_glfw_init(GLFWwindow** window, const int width, const int height)
{
//
// INITIALIZE GLFW/GLAD
//
glfwSetErrorCallback(pxl_glfw_error_callback);
if (!glfwInit())
exit(EXIT_FAILURE);
glfwWindowHint(GLFW_DEPTH_BITS, 0);
glfwWindowHint(GLFW_STENCIL_BITS, 0);
glfwWindowHint(GLFW_SRGB_CAPABLE, GL_TRUE);
glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 4);
glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 5);
glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
#ifdef PXL_FULLSCREEN
GLFWmonitor* monitor = glfwGetPrimaryMonitor();
const GLFWvidmode* mode = glfwGetVideoMode(monitor);
*window = glfwCreateWindow(mode->width,mode->height,"GLFW / CUDA Interop",monitor,NULL);
#else
*window = glfwCreateWindow(width,height,"GLFW / CUDA Interop",NULL,NULL);
#endif
if (*window == NULL)
{
glfwTerminate();
exit(EXIT_FAILURE);
}
glfwMakeContextCurrent(*window);
// set up GLAD
gladLoadGLLoader((GLADloadproc)glfwGetProcAddress);
// ignore vsync for now
glfwSwapInterval(0);
// only copy r/g/b
glColorMask(GL_TRUE,GL_TRUE,GL_TRUE,GL_FALSE);
// enable SRGB
// glEnable(GL_FRAMEBUFFER_SRGB);
}
//
//
//
static
void
pxl_glfw_window_size_callback(GLFWwindow* window, int width, int height)
{
// get context
struct pxl_interop* const interop = glfwGetWindowUserPointer(window);
pxl_interop_size_set(interop,width,height);
}
//
//
//
cudaError_t
pxl_kernel_launcher(cudaArray_const_t array,
const int width,
const int height,
cudaEvent_t event,
cudaStream_t stream);
//
//
//
int
main(int argc, char* argv[])
{
//
// INIT GLFW
//
GLFWwindow* window;
pxl_glfw_init(&window,1024,1024);
//
// INIT CUDA
//
cudaError_t cuda_err;
int gl_device_id,gl_device_count;
cuda_err = cuda(GLGetDevices(&gl_device_count,&gl_device_id,1,cudaGLDeviceListAll));
int cuda_device_id = (argc > 1) ? atoi(argv[1]) : gl_device_id;
cuda_err = cuda(SetDevice(cuda_device_id));
//
// MULTI-GPU?
//
const bool multi_gpu = gl_device_id != cuda_device_id;
//
// INFO
//
struct cudaDeviceProp props;
cuda_err = cuda(GetDeviceProperties(&props,gl_device_id));
printf("GL : %-24s (%2d)\n",props.name,props.multiProcessorCount);
cuda_err = cuda(GetDeviceProperties(&props,cuda_device_id));
printf("CUDA : %-24s (%2d)\n",props.name,props.multiProcessorCount);
//
// CREATE CUDA STREAM & EVENT
//
cudaStream_t stream;
cudaEvent_t event;
cuda_err = cuda(StreamCreateWithFlags(&stream,cudaStreamDefault)); // optionally ignore default stream behavior
cuda_err = cuda(EventCreateWithFlags(&event,cudaEventBlockingSync)); // | cudaEventDisableTiming);
//
// CREATE INTEROP
//
// TESTING -- DO NOT SET TO FALSE, ONLY TRUE IS RELIABLE
struct pxl_interop* const interop = pxl_interop_create(true /*multi_gpu*/,2);
//
// RESIZE INTEROP
//
int width, height;
// get initial width/height
glfwGetFramebufferSize(window,&width,&height);
// resize with initial window dimensions
cuda_err = pxl_interop_size_set(interop,width,height);
//
// SET USER POINTER AND CALLBACKS
//
glfwSetWindowUserPointer (window,interop);
glfwSetKeyCallback (window,pxl_glfw_key_callback);
glfwSetFramebufferSizeCallback(window,pxl_glfw_window_size_callback);
//
// LOOP UNTIL DONE
//
while (!glfwWindowShouldClose(window))
{
//
// MONITOR FPS
//
pxl_glfw_fps(window);
//
// EXECUTE CUDA KERNEL ON RENDER BUFFER
//
int width,height;
cudaArray_t cuda_array;
pxl_interop_size_get(interop,&width,&height);
cuda_err = pxl_interop_map(interop,stream);
cuda_err = pxl_kernel_launcher(pxl_interop_array_get(interop),
width,
height,
event,
stream);
cuda_err = pxl_interop_unmap(interop,stream);
//
// BLIT & SWAP FBO
//
pxl_interop_blit(interop);
// pxl_interop_clear(interop);
pxl_interop_swap(interop);
//
// SWAP WINDOW
//
glfwSwapBuffers(window);
//
// PUMP/POLL/WAIT
//
glfwPollEvents(); // glfwWaitEvents();
}
//
// CLEANUP
//
pxl_interop_destroy(interop);
glfwDestroyWindow(window);
glfwTerminate();
cuda(DeviceReset());
// missing some clean up here
exit(EXIT_SUCCESS);
}
//
//
//
@allanmac

This comment has been minimized.

Copy link
Owner Author

@allanmac allanmac commented Apr 14, 2015

On Windows you can build with something like this:

nvcc -Xptxas=-v -o interop -I glad\output\include -I glfw\x64\include glad\output\src\glad.c main.c interop.c assert_cuda.c kernel.cu glfw\x64\lib-vc2013\glfw3dll.lib

The executable requires the glfw3.dll.

@allanmac

This comment has been minimized.

Copy link
Owner Author

@allanmac allanmac commented Apr 14, 2015

>interop 0
GL   : GeForce GTX 680          ( 8)
CUDA : GeForce GTX 750 Ti       ( 5)

screen capture

@munesh1407

This comment has been minimized.

Copy link

@munesh1407 munesh1407 commented Apr 23, 2017

Hi
Just went through the code. I understand CUDA but not very proficient in OpenGL. Do you have some help files to initiate a beginner like me? Thanks.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment