Skip to content

Instantly share code, notes, and snippets.

@eruffaldi
Forked from allanmac/assert_cuda.c
Last active May 27, 2016 04:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save eruffaldi/f2cf1eee58100f092ec3 to your computer and use it in GitHub Desktop.
Save eruffaldi/f2cf1eee58100f092ec3 to your computer and use it in GitHub Desktop.
A tiny example of CUDA + OpenGL interop with write-only surfaces and CUDA kernels. Uses GLFW+GLAD.
//
//
//
#include <stdlib.h>
#include <stdio.h>
//
//
//
#include "assert_cuda.h"
//
//
//
cudaError_t
cuda_assert(const cudaError_t code, const char* const file, const int line, const bool abort)
{
if (code != cudaSuccess)
{
fprintf(stderr,"cuda_assert: %s %s %d\n",cudaGetErrorString(code),file,line);
if (abort)
{
cudaDeviceReset();
exit(code);
}
}
return code;
}
//
//
//
//
//
//
#pragma once
//
//
//
#include <cuda_runtime.h>
#include <stdbool.h>
//
// Beware that NVCC doesn't work with C files and __VA_ARGS__
//
cudaError_t cuda_assert(const cudaError_t code, const char* const file, const int line, const bool abort);
#define cuda(...) cuda_assert((cuda##__VA_ARGS__), __FILE__, __LINE__, true);
//
//
//
#sudo pip install --upgrade git+https://github.com/dav1dde/glad.git#egg=glad
#python main.py --api gl=3.3 --generator=c --out-path=GL
find_package(CUDA)
find_package(GLFW)
include_directories(${GLFW_INCLUDE_DIRS})
include_directories(GL/include)
cuda_add_executable(main main.c assert_cuda.c interop.c GL/src/glad.c kernel.cu)
target_link_libraries(main ${GLFW_LIBRARY})
//
//
//
#include <glad/glad.h>
#include <GLFW/glfw3.h>
#include <cuda_gl_interop.h>
#include <stdlib.h>
#include <stdio.h>
//
//
//
#include "assert_cuda.h"
#include "interop.h"
//
//
//
struct pxl_interop
{
// split GPUs?
bool multi_gpu;
// number of fbo's
int count;
int index;
// w x h
int width;
int height;
// GL buffers
GLuint* fb;
GLuint* rb;
// CUDA resources
cudaGraphicsResource_t* cgr;
cudaArray_t* ca;
};
//
//
//
struct pxl_interop*
pxl_interop_create(const bool multi_gpu, const int fbo_count)
{
struct pxl_interop* const interop = calloc(1,sizeof(*interop));
interop->multi_gpu = multi_gpu;
interop->count = fbo_count;
interop->index = 0;
// allocate arrays
interop->fb = calloc(fbo_count,sizeof(*(interop->fb )));
interop->rb = calloc(fbo_count,sizeof(*(interop->rb )));
interop->cgr = calloc(fbo_count,sizeof(*(interop->cgr)));
interop->ca = calloc(fbo_count,sizeof(*(interop->ca)));
fprintf(stderr,"creating buffers\n");
#ifdef Opengl4_5
// render buffer object w/a color buffer
glCreateRenderbuffers(fbo_count,interop->rb);
// frame buffer object
glCreateFramebuffers(fbo_count,interop->fb);
#else
glGenRenderbuffers(fbo_count,interop->rb);
glGenFramebuffers(fbo_count,interop->fb);
#endif
fprintf(stderr,"created buffers\n");
// attach rbo to fbo
for (int index=0; index<fbo_count; index++)
{
#ifdef Opengl4_5
glNamedFramebufferRenderbuffer(interop->fb[index],
GL_COLOR_ATTACHMENT0,
GL_RENDERBUFFER,
interop->rb[index]);
#else
glBindFramebuffer(GL_FRAMEBUFFER,interop->fb[index]);
glFramebufferRenderbuffer(GL_FRAMEBUFFER,
GL_COLOR_ATTACHMENT0,
GL_RENDERBUFFER,
interop->rb[index]);
glBindFramebuffer(GL_FRAMEBUFFER,0);
#endif
}
fprintf(stderr,"bound buffers\n");
// return it
return interop;
}
void
pxl_interop_destroy(struct pxl_interop* const interop)
{
cudaError_t cuda_err;
// unregister CUDA resources
for (int index=0; index<interop->count; index++)
{
if (interop->cgr[index] != NULL)
cuda_err = cuda(GraphicsUnregisterResource(interop->cgr[index]));
}
// delete rbo's
glDeleteRenderbuffers(interop->count,interop->rb);
// delete fbo's
glDeleteFramebuffers(interop->count,interop->fb);
// free buffers and resources
free(interop->fb);
free(interop->rb);
free(interop->cgr);
free(interop->ca);
// free interop
free(interop);
}
//
//
//
cudaError_t
pxl_interop_size_set(struct pxl_interop* const interop, const int width, const int height)
{
cudaError_t cuda_err = cudaSuccess;
// save new size
interop->width = width;
interop->height = height;
// resize color buffer
for (int index=0; index<interop->count; index++)
{
// unregister resource
if (interop->cgr[index] != NULL)
cuda_err = cuda(GraphicsUnregisterResource(interop->cgr[index]));
#ifdef Opengl4_5
// resize rbo
glNamedRenderbufferStorage(interop->rb[index],GL_RGBA8,width,height);
#else
glBindRenderbuffer(GL_RENDERBUFFER,interop->rb[index]);
glRenderbufferStorage(GL_RENDERBUFFER,GL_RGBA8,width,height);
glBindRenderbuffer(GL_RENDERBUFFER,0);
#endif
// probe fbo status
// glCheckNamedFramebufferStatus(interop->fb[index],0);
// register rbo
cuda_err = cuda(GraphicsGLRegisterImage(&interop->cgr[index],
interop->rb[index],
GL_RENDERBUFFER,
cudaGraphicsRegisterFlagsSurfaceLoadStore |
cudaGraphicsRegisterFlagsWriteDiscard));
}
// map graphics resources
cuda_err = cuda(GraphicsMapResources(interop->count,interop->cgr,0));
// get CUDA Array refernces
for (int index=0; index<interop->count; index++)
{
cuda_err = cuda(GraphicsSubResourceGetMappedArray(&interop->ca[index],
interop->cgr[index],
0,0));
}
// unmap graphics resources
cuda_err = cuda(GraphicsUnmapResources(interop->count,interop->cgr,0));
return cuda_err;
}
void
pxl_interop_size_get(struct pxl_interop* const interop, int* const width, int* const height)
{
*width = interop->width;
*height = interop->height;
}
//
//
//
cudaError_t
pxl_interop_map(struct pxl_interop* const interop, cudaStream_t stream)
{
if (!interop->multi_gpu)
return cudaSuccess;
// map graphics resources
return cuda(GraphicsMapResources(1,&interop->cgr[interop->index],stream));
}
cudaError_t
pxl_interop_unmap(struct pxl_interop* const interop, cudaStream_t stream)
{
if (!interop->multi_gpu)
return cudaSuccess;
return cuda(GraphicsUnmapResources(1,&interop->cgr[interop->index],stream));
}
cudaError_t
pxl_interop_array_map(struct pxl_interop* const interop)
{
//
// FIXME -- IS THIS EVEN NEEDED?
//
cudaError_t cuda_err;
// get a CUDA Array
cuda_err = cuda(GraphicsSubResourceGetMappedArray(&interop->ca[interop->index],
interop->cgr[interop->index],
0,0));
return cuda_err;
}
//
//
//
cudaArray_const_t
pxl_interop_array_get(struct pxl_interop* const interop)
{
return interop->ca[interop->index];
}
int
pxl_interop_index_get(struct pxl_interop* const interop)
{
return interop->index;
}
//
//
//
void
pxl_interop_swap(struct pxl_interop* const interop)
{
interop->index = (interop->index + 1) % interop->count;
}
//
//
//
void
pxl_interop_clear(struct pxl_interop* const interop)
{
/*
static const GLenum attachments[] = { GL_COLOR_ATTACHMENT0 };
glInvalidateNamedFramebufferData(interop->fb[interop->index],1,attachments);
*/
const GLfloat clear_color[] = { 1.0f, 1.0f, 1.0f, 1.0f };
#ifdef Opengl4_5
glClearNamedFramebufferfv(interop->fb[interop->index],GL_COLOR,0,clear_color);
#else
glBindFramebuffer(GL_FRAMEBUFFER,interop->fb[interop->index]);
glClearBufferfv(GL_COLOR,0,clear_color);
glBindFramebuffer(GL_FRAMEBUFFER,0);
#endif
}
//
//
//
void
pxl_interop_blit(struct pxl_interop* const interop)
{
#ifdef Opengl4_5
glBlitNamedFramebuffer(interop->fb[interop->index],0,
0,0, interop->width,interop->height,
0,interop->height,interop->width,0,
GL_COLOR_BUFFER_BIT,
GL_NEAREST);
#else
glBindFramebuffer(GL_READ_FRAMEBUFFER,interop->fb[interop->index]);
glBlitFramebuffer(
0,0, interop->width,interop->height,
0,interop->height,interop->width,0,
GL_COLOR_BUFFER_BIT,
GL_NEAREST);
glBindFramebuffer(GL_READ_FRAMEBUFFER,0);
#endif
}
//
//
//
//
//
//
#pragma once
//
//
//
#include <cuda_runtime.h>
#include <stdbool.h>
//
//
//
struct pxl_interop*
pxl_interop_create(const bool multi_gpu, const int fbo_count);
void
pxl_interop_destroy(struct pxl_interop* const interop);
//
//
//
cudaError_t
pxl_interop_size_set(struct pxl_interop* const interop, const int width, const int height);
void
pxl_interop_size_get(struct pxl_interop* const interop, int* const width, int* const height);
//
//
//
cudaError_t
pxl_interop_map(struct pxl_interop* const interop, cudaStream_t stream);
cudaError_t
pxl_interop_unmap(struct pxl_interop* const interop, cudaStream_t stream);
cudaError_t
pxl_interop_array_map(struct pxl_interop* const interop);
//
//
//
cudaArray_const_t
pxl_interop_array_get(struct pxl_interop* const interop);
cudaStream_t
pxl_interop_stream_get(struct pxl_interop* const interop);
int
pxl_interop_index_get(struct pxl_interop* const interop);
//
//
//
void
pxl_interop_swap(struct pxl_interop* const interop);
void
pxl_interop_clear(struct pxl_interop* const interop);
void
pxl_interop_blit(struct pxl_interop* const interop);
//
//
//
// -*- compile-command: "nvcc arch sm_50 -Xptxas=-v -cubin kernel.cu"; -*-
//
//
//
#ifdef __cplusplus
extern "C" {
#endif
#include "assert_cuda.h"
#ifdef __cplusplus
}
#endif
//
//
//
#define PXL_KERNEL_THREADS_PER_BLOCK 256 // enough for 4Kx2 monitor
//
//
//
surface<void,cudaSurfaceType2D> surf;
//
//
//
union pxl_rgbx_24
{
uint1 b32;
struct {
unsigned r : 8;
unsigned g : 8;
unsigned b : 8;
unsigned na : 8;
};
};
//
//
//
extern "C"
__global__
void
pxl_kernel(const int width, const int height)
{
// pixel coordinates
const int idx = (blockDim.x * blockIdx.x) + threadIdx.x;
const int x = idx % width;
const int y = idx / width;
#if 1
// pixel color
const int t = (unsigned int)clock() / 1100000; // 1.1 GHz
const int xt = (idx + t) % width;
const unsigned int ramp = (unsigned int)(((float)xt / (float)(width-1)) * 255.0f + 0.5f);
const unsigned int bar = ((y + t) / 32) & 3;
union pxl_rgbx_24 rgbx;
rgbx.r = (bar == 0) || (bar == 1) ? ramp : 0;
rgbx.g = (bar == 0) || (bar == 2) ? ramp : 0;
rgbx.b = (bar == 0) || (bar == 3) ? ramp : 0;
rgbx.na = 255;
#else // DRAW A RED BORDER TO VALIDATE FLIPPED BLIT
const bool border = (x == 0) || (x == width-1) || (y == 0) || (y == height-1);
union pxl_rgbx_24 rgbx = { border ? 0xFF0000FF : 0xFF000000 };
#endif
surf2Dwrite(rgbx.b32, // even simpler: (unsigned int)clock()
surf,
x*sizeof(rgbx),
y,
cudaBoundaryModeZero); // squelches out-of-bound writes
}
//
//
//
extern "C"
cudaError_t
pxl_kernel_launcher(cudaArray_const_t array,
const int width,
const int height,
cudaEvent_t event,
cudaStream_t stream)
{
cudaError_t cuda_err;
// cuda_err = cudaEventRecord(event,stream);
cuda_err = cuda(BindSurfaceToArray(surf,array));
if (cuda_err)
return cuda_err;
const int blocks = (width * height + PXL_KERNEL_THREADS_PER_BLOCK - 1) / PXL_KERNEL_THREADS_PER_BLOCK;
// cuda_err = cudaEventRecord(event,stream);
if (blocks > 0)
pxl_kernel<<<blocks,PXL_KERNEL_THREADS_PER_BLOCK,0,stream>>>(width,height);
// cuda_err = cudaStreamWaitEvent(stream,event,0);
return cudaSuccess;
}
//
//
//
//
//
//
#include <glad/glad.h>
#include <GLFW/glfw3.h>
//
//
//
#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>
//
//
//
#include <cuda_gl_interop.h>
//
//
//
#include "assert_cuda.h"
#include "interop.h"
//
// FPS COUNTER FROM HERE:
//
// http://antongerdelan.net/opengl/glcontext2.html
//
static
void
pxl_glfw_fps(GLFWwindow* window)
{
// static fps counters
static double stamp_prev = 0.0;
static int frame_count = 0;
// locals
const double stamp_curr = glfwGetTime();
const double elapsed = stamp_curr - stamp_prev;
if (elapsed > 0.5)
{
stamp_prev = stamp_curr;
const double fps = (double)frame_count / elapsed;
int width, height;
char tmp[64];
glfwGetFramebufferSize(window,&width,&height);
sprintf(tmp,"(%u x %u) - FPS: %.2f",width,height,fps);
glfwSetWindowTitle(window,tmp);
frame_count = 0;
}
frame_count++;
}
//
//
//
static
void
pxl_glfw_error_callback(int error, const char* description)
{
fputs(description,stderr);
}
static
void
pxl_glfw_key_callback(GLFWwindow* window, int key, int scancode, int action, int mods)
{
if (key == GLFW_KEY_ESCAPE && action == GLFW_PRESS)
glfwSetWindowShouldClose(window, GL_TRUE);
}
static
void
pxl_glfw_init(GLFWwindow** window, const int width, const int height)
{
//
// INITIALIZE GLFW/GLAD
//
glfwSetErrorCallback(pxl_glfw_error_callback);
if (!glfwInit())
exit(EXIT_FAILURE);
glfwWindowHint(GLFW_DEPTH_BITS, 0);
glfwWindowHint(GLFW_STENCIL_BITS, 0);
#ifdef Opengl_45
glfwWindowHint(GLFW_SRGB_CAPABLE, GL_TRUE);
glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3);
glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 3);
glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
#else
glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3);
glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 3);
glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
glfwWindowHint(GLFW_OPENGL_FORWARD_COMPAT, GL_TRUE);
#endif
#ifdef PXL_FULLSCREEN
GLFWmonitor* monitor = glfwGetPrimaryMonitor();
const GLFWvidmode* mode = glfwGetVideoMode(monitor);
*window = glfwCreateWindow(mode->width,mode->height,"GLFW / CUDA Interop",monitor,NULL);
#else
*window = glfwCreateWindow(width,height,"GLFW / CUDA Interop",NULL,NULL);
#endif
if (*window == NULL)
{
glfwTerminate();
exit(EXIT_FAILURE);
}
glfwMakeContextCurrent(*window);
// set up GLAD
gladLoadGLLoader((GLADloadproc)glfwGetProcAddress);
// ignore vsync for now
glfwSwapInterval(0);
// only copy r/g/b
glColorMask(GL_TRUE,GL_TRUE,GL_TRUE,GL_FALSE);
// enable SRGB
// glEnable(GL_FRAMEBUFFER_SRGB);
}
//
//
//
static
void
pxl_glfw_window_size_callback(GLFWwindow* window, int width, int height)
{
// get context
struct pxl_interop* const interop = glfwGetWindowUserPointer(window);
pxl_interop_size_set(interop,width,height);
}
//
//
//
cudaError_t
pxl_kernel_launcher(cudaArray_const_t array,
const int width,
const int height,
cudaEvent_t event,
cudaStream_t stream);
//
//
//
int
main(int argc, char* argv[])
{
//
// INIT GLFW
//
GLFWwindow* window;
pxl_glfw_init(&window,1024,1024);
//
// INIT CUDA
//
cudaError_t cuda_err;
int gl_device_id=0,gl_device_count=0,cuda_device_id=0;
#ifdef Opengl_45
cuda_err = cuda(GLGetDevices(&gl_device_count,&gl_device_id,1,cudaGLDeviceListAll));
cuda_device_id = (argc > 1) ? atoi(argv[1]) : gl_device_id;
cuda_err = cuda(SetDevice(cuda_device_id));
#else
cudaGLSetGLDevice(0);
#endif
//
// MULTI-GPU?
//
const bool multi_gpu = gl_device_id != cuda_device_id;
//
// INFO
//
struct cudaDeviceProp props;
cuda_err = cuda(GetDeviceProperties(&props,gl_device_id));
printf("GL : %-24s (%2d)\n",props.name,props.multiProcessorCount);
cuda_err = cuda(GetDeviceProperties(&props,cuda_device_id));
printf("CUDA : %-24s (%2d)\n",props.name,props.multiProcessorCount);
//
// CREATE CUDA STREAM & EVENT
//
cudaStream_t stream;
cudaEvent_t event;
fprintf(stderr,"before create\n");
cuda_err = cuda(StreamCreateWithFlags(&stream,cudaStreamDefault)); // optionally ignore default stream behavior
cuda_err = cuda(EventCreateWithFlags(&event,cudaEventBlockingSync)); // | cudaEventDisableTiming);
//
// CREATE INTEROP
//
// TESTING -- DO NOT SET TO FALSE, ONLY TRUE IS RELIABLE
fprintf(stderr,"pxl_interop_create\n");
struct pxl_interop* const interop = pxl_interop_create(true /*multi_gpu*/,2);
fprintf(stderr,"after pxl_interop_create\n");
//
// RESIZE INTEROP
//
int width, height;
// get initial width/height
glfwGetFramebufferSize(window,&width,&height);
// resize with initial window dimensions
cuda_err = pxl_interop_size_set(interop,width,height);
//
// SET USER POINTER AND CALLBACKS
//
glfwSetWindowUserPointer (window,interop);
glfwSetKeyCallback (window,pxl_glfw_key_callback);
glfwSetFramebufferSizeCallback(window,pxl_glfw_window_size_callback);
fprintf(stderr,"loop\n");
//
// LOOP UNTIL DONE
//
while (!glfwWindowShouldClose(window))
{
//
// MONITOR FPS
//
pxl_glfw_fps(window);
//
// EXECUTE CUDA KERNEL ON RENDER BUFFER
//
int width,height;
cudaArray_t cuda_array;
pxl_interop_size_get(interop,&width,&height);
cuda_err = pxl_interop_map(interop,stream);
cuda_err = pxl_kernel_launcher(pxl_interop_array_get(interop),
width,
height,
event,
stream);
cuda_err = pxl_interop_unmap(interop,stream);
//
// BLIT & SWAP FBO
//
pxl_interop_blit(interop);
// pxl_interop_clear(interop);
pxl_interop_swap(interop);
//
// SWAP WINDOW
//
glfwSwapBuffers(window);
//
// PUMP/POLL/WAIT
//
glfwPollEvents(); // glfwWaitEvents();
}
//
// CLEANUP
//
pxl_interop_destroy(interop);
glfwDestroyWindow(window);
glfwTerminate();
cuda(DeviceReset());
// missing some clean up here
exit(EXIT_SUCCESS);
}
//
//
//
@eruffaldi
Copy link
Author

Forked original for backporting to OpenGL 3.3, building with CMake, instructions for using GLAD.

Tested under OSX 10.10 with CUDA 7.5

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment