-
-
Save allanmac/4ff11985c3562830989f to your computer and use it in GitHub Desktop.
// | |
// | |
// | |
#include <stdlib.h> | |
#include <stdio.h> | |
// | |
// | |
// | |
#include "assert_cuda.h" | |
// | |
// | |
// | |
cudaError_t | |
cuda_assert(const cudaError_t code, const char* const file, const int line, const bool abort) | |
{ | |
if (code != cudaSuccess) | |
{ | |
fprintf(stderr,"cuda_assert: %s %s %d\n",cudaGetErrorString(code),file,line); | |
if (abort) | |
{ | |
cudaDeviceReset(); | |
exit(code); | |
} | |
} | |
return code; | |
} | |
// | |
// | |
// |
// | |
// | |
// | |
#pragma once | |
// | |
// | |
// | |
#include <cuda_runtime.h> | |
#include <stdbool.h> | |
// | |
// Beware that NVCC doesn't work with C files and __VA_ARGS__ | |
// | |
cudaError_t cuda_assert(const cudaError_t code, const char* const file, const int line, const bool abort); | |
#define cuda(...) cuda_assert((cuda##__VA_ARGS__), __FILE__, __LINE__, true); | |
// | |
// | |
// | |
// | |
// | |
// | |
#include <glad/glad.h> | |
#include <GLFW/glfw3.h> | |
#include <cuda_gl_interop.h> | |
#include <stdlib.h> | |
// | |
// | |
// | |
#include "assert_cuda.h" | |
#include "interop.h" | |
// | |
// | |
// | |
struct pxl_interop | |
{ | |
// split GPUs? | |
bool multi_gpu; | |
// number of fbo's | |
int count; | |
int index; | |
// w x h | |
int width; | |
int height; | |
// GL buffers | |
GLuint* fb; | |
GLuint* rb; | |
// CUDA resources | |
cudaGraphicsResource_t* cgr; | |
cudaArray_t* ca; | |
}; | |
// | |
// | |
// | |
struct pxl_interop* | |
pxl_interop_create(const bool multi_gpu, const int fbo_count) | |
{ | |
struct pxl_interop* const interop = calloc(1,sizeof(*interop)); | |
interop->multi_gpu = multi_gpu; | |
interop->count = fbo_count; | |
interop->index = 0; | |
// allocate arrays | |
interop->fb = calloc(fbo_count,sizeof(*(interop->fb ))); | |
interop->rb = calloc(fbo_count,sizeof(*(interop->rb ))); | |
interop->cgr = calloc(fbo_count,sizeof(*(interop->cgr))); | |
interop->ca = calloc(fbo_count,sizeof(*(interop->ca))); | |
// render buffer object w/a color buffer | |
glCreateRenderbuffers(fbo_count,interop->rb); | |
// frame buffer object | |
glCreateFramebuffers(fbo_count,interop->fb); | |
// attach rbo to fbo | |
for (int index=0; index<fbo_count; index++) | |
{ | |
glNamedFramebufferRenderbuffer(interop->fb[index], | |
GL_COLOR_ATTACHMENT0, | |
GL_RENDERBUFFER, | |
interop->rb[index]); | |
} | |
// return it | |
return interop; | |
} | |
void | |
pxl_interop_destroy(struct pxl_interop* const interop) | |
{ | |
cudaError_t cuda_err; | |
// unregister CUDA resources | |
for (int index=0; index<interop->count; index++) | |
{ | |
if (interop->cgr[index] != NULL) | |
cuda_err = cuda(GraphicsUnregisterResource(interop->cgr[index])); | |
} | |
// delete rbo's | |
glDeleteRenderbuffers(interop->count,interop->rb); | |
// delete fbo's | |
glDeleteFramebuffers(interop->count,interop->fb); | |
// free buffers and resources | |
free(interop->fb); | |
free(interop->rb); | |
free(interop->cgr); | |
free(interop->ca); | |
// free interop | |
free(interop); | |
} | |
// | |
// | |
// | |
cudaError_t | |
pxl_interop_size_set(struct pxl_interop* const interop, const int width, const int height) | |
{ | |
cudaError_t cuda_err = cudaSuccess; | |
// save new size | |
interop->width = width; | |
interop->height = height; | |
// resize color buffer | |
for (int index=0; index<interop->count; index++) | |
{ | |
// unregister resource | |
if (interop->cgr[index] != NULL) | |
cuda_err = cuda(GraphicsUnregisterResource(interop->cgr[index])); | |
// resize rbo | |
glNamedRenderbufferStorage(interop->rb[index],GL_RGBA8,width,height); | |
// probe fbo status | |
// glCheckNamedFramebufferStatus(interop->fb[index],0); | |
// register rbo | |
cuda_err = cuda(GraphicsGLRegisterImage(&interop->cgr[index], | |
interop->rb[index], | |
GL_RENDERBUFFER, | |
cudaGraphicsRegisterFlagsSurfaceLoadStore | | |
cudaGraphicsRegisterFlagsWriteDiscard)); | |
} | |
// map graphics resources | |
cuda_err = cuda(GraphicsMapResources(interop->count,interop->cgr,0)); | |
// get CUDA Array refernces | |
for (int index=0; index<interop->count; index++) | |
{ | |
cuda_err = cuda(GraphicsSubResourceGetMappedArray(&interop->ca[index], | |
interop->cgr[index], | |
0,0)); | |
} | |
// unmap graphics resources | |
cuda_err = cuda(GraphicsUnmapResources(interop->count,interop->cgr,0)); | |
return cuda_err; | |
} | |
void | |
pxl_interop_size_get(struct pxl_interop* const interop, int* const width, int* const height) | |
{ | |
*width = interop->width; | |
*height = interop->height; | |
} | |
// | |
// | |
// | |
cudaError_t | |
pxl_interop_map(struct pxl_interop* const interop, cudaStream_t stream) | |
{ | |
if (!interop->multi_gpu) | |
return cudaSuccess; | |
// map graphics resources | |
return cuda(GraphicsMapResources(1,&interop->cgr[interop->index],stream)); | |
} | |
cudaError_t | |
pxl_interop_unmap(struct pxl_interop* const interop, cudaStream_t stream) | |
{ | |
if (!interop->multi_gpu) | |
return cudaSuccess; | |
return cuda(GraphicsUnmapResources(1,&interop->cgr[interop->index],stream)); | |
} | |
cudaError_t | |
pxl_interop_array_map(struct pxl_interop* const interop) | |
{ | |
// | |
// FIXME -- IS THIS EVEN NEEDED? | |
// | |
cudaError_t cuda_err; | |
// get a CUDA Array | |
cuda_err = cuda(GraphicsSubResourceGetMappedArray(&interop->ca[interop->index], | |
interop->cgr[interop->index], | |
0,0)); | |
return cuda_err; | |
} | |
// | |
// | |
// | |
cudaArray_const_t | |
pxl_interop_array_get(struct pxl_interop* const interop) | |
{ | |
return interop->ca[interop->index]; | |
} | |
int | |
pxl_interop_index_get(struct pxl_interop* const interop) | |
{ | |
return interop->index; | |
} | |
// | |
// | |
// | |
void | |
pxl_interop_swap(struct pxl_interop* const interop) | |
{ | |
interop->index = (interop->index + 1) % interop->count; | |
} | |
// | |
// | |
// | |
void | |
pxl_interop_clear(struct pxl_interop* const interop) | |
{ | |
/* | |
static const GLenum attachments[] = { GL_COLOR_ATTACHMENT0 }; | |
glInvalidateNamedFramebufferData(interop->fb[interop->index],1,attachments); | |
*/ | |
const GLfloat clear_color[] = { 1.0f, 1.0f, 1.0f, 1.0f }; | |
glClearNamedFramebufferfv(interop->fb[interop->index],GL_COLOR,0,clear_color); | |
} | |
// | |
// | |
// | |
void | |
pxl_interop_blit(struct pxl_interop* const interop) | |
{ | |
glBlitNamedFramebuffer(interop->fb[interop->index],0, | |
0,0, interop->width,interop->height, | |
0,interop->height,interop->width,0, | |
GL_COLOR_BUFFER_BIT, | |
GL_NEAREST); | |
} | |
// | |
// | |
// |
// | |
// | |
// | |
#pragma once | |
// | |
// | |
// | |
#include <cuda_runtime.h> | |
#include <stdbool.h> | |
// | |
// | |
// | |
struct pxl_interop* | |
pxl_interop_create(const bool multi_gpu, const int fbo_count); | |
void | |
pxl_interop_destroy(struct pxl_interop* const interop); | |
// | |
// | |
// | |
cudaError_t | |
pxl_interop_size_set(struct pxl_interop* const interop, const int width, const int height); | |
void | |
pxl_interop_size_get(struct pxl_interop* const interop, int* const width, int* const height); | |
// | |
// | |
// | |
cudaError_t | |
pxl_interop_map(struct pxl_interop* const interop, cudaStream_t stream); | |
cudaError_t | |
pxl_interop_unmap(struct pxl_interop* const interop, cudaStream_t stream); | |
cudaError_t | |
pxl_interop_array_map(struct pxl_interop* const interop); | |
// | |
// | |
// | |
cudaArray_const_t | |
pxl_interop_array_get(struct pxl_interop* const interop); | |
cudaStream_t | |
pxl_interop_stream_get(struct pxl_interop* const interop); | |
int | |
pxl_interop_index_get(struct pxl_interop* const interop); | |
// | |
// | |
// | |
void | |
pxl_interop_swap(struct pxl_interop* const interop); | |
void | |
pxl_interop_clear(struct pxl_interop* const interop); | |
void | |
pxl_interop_blit(struct pxl_interop* const interop); | |
// | |
// | |
// |
// -*- compile-command: "nvcc arch sm_50 -Xptxas=-v -cubin kernel.cu"; -*- | |
// | |
// | |
// | |
#ifdef __cplusplus | |
extern "C" { | |
#endif | |
#include "assert_cuda.h" | |
#ifdef __cplusplus | |
} | |
#endif | |
// | |
// | |
// | |
#define PXL_KERNEL_THREADS_PER_BLOCK 256 // enough for 4Kx2 monitor | |
// | |
// | |
// | |
surface<void,cudaSurfaceType2D> surf; | |
// | |
// | |
// | |
union pxl_rgbx_24 | |
{ | |
uint1 b32; | |
struct { | |
unsigned r : 8; | |
unsigned g : 8; | |
unsigned b : 8; | |
unsigned na : 8; | |
}; | |
}; | |
// | |
// | |
// | |
extern "C" | |
__global__ | |
void | |
pxl_kernel(const int width, const int height) | |
{ | |
// pixel coordinates | |
const int idx = (blockDim.x * blockIdx.x) + threadIdx.x; | |
const int x = idx % width; | |
const int y = idx / width; | |
#if 1 | |
// pixel color | |
const int t = (unsigned int)clock() / 1100000; // 1.1 GHz | |
const int xt = (idx + t) % width; | |
const unsigned int ramp = (unsigned int)(((float)xt / (float)(width-1)) * 255.0f + 0.5f); | |
const unsigned int bar = ((y + t) / 32) & 3; | |
union pxl_rgbx_24 rgbx; | |
rgbx.r = (bar == 0) || (bar == 1) ? ramp : 0; | |
rgbx.g = (bar == 0) || (bar == 2) ? ramp : 0; | |
rgbx.b = (bar == 0) || (bar == 3) ? ramp : 0; | |
rgbx.na = 255; | |
#else // DRAW A RED BORDER TO VALIDATE FLIPPED BLIT | |
const bool border = (x == 0) || (x == width-1) || (y == 0) || (y == height-1); | |
union pxl_rgbx_24 rgbx = { border ? 0xFF0000FF : 0xFF000000 }; | |
#endif | |
surf2Dwrite(rgbx.b32, // even simpler: (unsigned int)clock() | |
surf, | |
x*sizeof(rgbx), | |
y, | |
cudaBoundaryModeZero); // squelches out-of-bound writes | |
} | |
// | |
// | |
// | |
extern "C" | |
cudaError_t | |
pxl_kernel_launcher(cudaArray_const_t array, | |
const int width, | |
const int height, | |
cudaEvent_t event, | |
cudaStream_t stream) | |
{ | |
cudaError_t cuda_err; | |
// cuda_err = cudaEventRecord(event,stream); | |
cuda_err = cuda(BindSurfaceToArray(surf,array)); | |
if (cuda_err) | |
return cuda_err; | |
const int blocks = (width * height + PXL_KERNEL_THREADS_PER_BLOCK - 1) / PXL_KERNEL_THREADS_PER_BLOCK; | |
// cuda_err = cudaEventRecord(event,stream); | |
if (blocks > 0) | |
pxl_kernel<<<blocks,PXL_KERNEL_THREADS_PER_BLOCK,0,stream>>>(width,height); | |
// cuda_err = cudaStreamWaitEvent(stream,event,0); | |
return cudaSuccess; | |
} | |
// | |
// | |
// |
// | |
// | |
// | |
#include <glad/glad.h> | |
#include <GLFW/glfw3.h> | |
// | |
// | |
// | |
#include <stdlib.h> | |
#include <stdio.h> | |
#include <stdbool.h> | |
// | |
// | |
// | |
#include <cuda_gl_interop.h> | |
// | |
// | |
// | |
#include "assert_cuda.h" | |
#include "interop.h" | |
// | |
// FPS COUNTER FROM HERE: | |
// | |
// http://antongerdelan.net/opengl/glcontext2.html | |
// | |
static | |
void | |
pxl_glfw_fps(GLFWwindow* window) | |
{ | |
// static fps counters | |
static double stamp_prev = 0.0; | |
static int frame_count = 0; | |
// locals | |
const double stamp_curr = glfwGetTime(); | |
const double elapsed = stamp_curr - stamp_prev; | |
if (elapsed > 0.5) | |
{ | |
stamp_prev = stamp_curr; | |
const double fps = (double)frame_count / elapsed; | |
int width, height; | |
char tmp[64]; | |
glfwGetFramebufferSize(window,&width,&height); | |
sprintf_s(tmp,64,"(%u x %u) - FPS: %.2f",width,height,fps); | |
glfwSetWindowTitle(window,tmp); | |
frame_count = 0; | |
} | |
frame_count++; | |
} | |
// | |
// | |
// | |
static | |
void | |
pxl_glfw_error_callback(int error, const char* description) | |
{ | |
fputs(description,stderr); | |
} | |
static | |
void | |
pxl_glfw_key_callback(GLFWwindow* window, int key, int scancode, int action, int mods) | |
{ | |
if (key == GLFW_KEY_ESCAPE && action == GLFW_PRESS) | |
glfwSetWindowShouldClose(window, GL_TRUE); | |
} | |
static | |
void | |
pxl_glfw_init(GLFWwindow** window, const int width, const int height) | |
{ | |
// | |
// INITIALIZE GLFW/GLAD | |
// | |
glfwSetErrorCallback(pxl_glfw_error_callback); | |
if (!glfwInit()) | |
exit(EXIT_FAILURE); | |
glfwWindowHint(GLFW_DEPTH_BITS, 0); | |
glfwWindowHint(GLFW_STENCIL_BITS, 0); | |
glfwWindowHint(GLFW_SRGB_CAPABLE, GL_TRUE); | |
glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 4); | |
glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 5); | |
glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE); | |
#ifdef PXL_FULLSCREEN | |
GLFWmonitor* monitor = glfwGetPrimaryMonitor(); | |
const GLFWvidmode* mode = glfwGetVideoMode(monitor); | |
*window = glfwCreateWindow(mode->width,mode->height,"GLFW / CUDA Interop",monitor,NULL); | |
#else | |
*window = glfwCreateWindow(width,height,"GLFW / CUDA Interop",NULL,NULL); | |
#endif | |
if (*window == NULL) | |
{ | |
glfwTerminate(); | |
exit(EXIT_FAILURE); | |
} | |
glfwMakeContextCurrent(*window); | |
// set up GLAD | |
gladLoadGLLoader((GLADloadproc)glfwGetProcAddress); | |
// ignore vsync for now | |
glfwSwapInterval(0); | |
// only copy r/g/b | |
glColorMask(GL_TRUE,GL_TRUE,GL_TRUE,GL_FALSE); | |
// enable SRGB | |
// glEnable(GL_FRAMEBUFFER_SRGB); | |
} | |
// | |
// | |
// | |
static | |
void | |
pxl_glfw_window_size_callback(GLFWwindow* window, int width, int height) | |
{ | |
// get context | |
struct pxl_interop* const interop = glfwGetWindowUserPointer(window); | |
pxl_interop_size_set(interop,width,height); | |
} | |
// | |
// | |
// | |
cudaError_t | |
pxl_kernel_launcher(cudaArray_const_t array, | |
const int width, | |
const int height, | |
cudaEvent_t event, | |
cudaStream_t stream); | |
// | |
// | |
// | |
int | |
main(int argc, char* argv[]) | |
{ | |
// | |
// INIT GLFW | |
// | |
GLFWwindow* window; | |
pxl_glfw_init(&window,1024,1024); | |
// | |
// INIT CUDA | |
// | |
cudaError_t cuda_err; | |
int gl_device_id,gl_device_count; | |
cuda_err = cuda(GLGetDevices(&gl_device_count,&gl_device_id,1,cudaGLDeviceListAll)); | |
int cuda_device_id = (argc > 1) ? atoi(argv[1]) : gl_device_id; | |
cuda_err = cuda(SetDevice(cuda_device_id)); | |
// | |
// MULTI-GPU? | |
// | |
const bool multi_gpu = gl_device_id != cuda_device_id; | |
// | |
// INFO | |
// | |
struct cudaDeviceProp props; | |
cuda_err = cuda(GetDeviceProperties(&props,gl_device_id)); | |
printf("GL : %-24s (%2d)\n",props.name,props.multiProcessorCount); | |
cuda_err = cuda(GetDeviceProperties(&props,cuda_device_id)); | |
printf("CUDA : %-24s (%2d)\n",props.name,props.multiProcessorCount); | |
// | |
// CREATE CUDA STREAM & EVENT | |
// | |
cudaStream_t stream; | |
cudaEvent_t event; | |
cuda_err = cuda(StreamCreateWithFlags(&stream,cudaStreamDefault)); // optionally ignore default stream behavior | |
cuda_err = cuda(EventCreateWithFlags(&event,cudaEventBlockingSync)); // | cudaEventDisableTiming); | |
// | |
// CREATE INTEROP | |
// | |
// TESTING -- DO NOT SET TO FALSE, ONLY TRUE IS RELIABLE | |
struct pxl_interop* const interop = pxl_interop_create(true /*multi_gpu*/,2); | |
// | |
// RESIZE INTEROP | |
// | |
int width, height; | |
// get initial width/height | |
glfwGetFramebufferSize(window,&width,&height); | |
// resize with initial window dimensions | |
cuda_err = pxl_interop_size_set(interop,width,height); | |
// | |
// SET USER POINTER AND CALLBACKS | |
// | |
glfwSetWindowUserPointer (window,interop); | |
glfwSetKeyCallback (window,pxl_glfw_key_callback); | |
glfwSetFramebufferSizeCallback(window,pxl_glfw_window_size_callback); | |
// | |
// LOOP UNTIL DONE | |
// | |
while (!glfwWindowShouldClose(window)) | |
{ | |
// | |
// MONITOR FPS | |
// | |
pxl_glfw_fps(window); | |
// | |
// EXECUTE CUDA KERNEL ON RENDER BUFFER | |
// | |
int width,height; | |
cudaArray_t cuda_array; | |
pxl_interop_size_get(interop,&width,&height); | |
cuda_err = pxl_interop_map(interop,stream); | |
cuda_err = pxl_kernel_launcher(pxl_interop_array_get(interop), | |
width, | |
height, | |
event, | |
stream); | |
cuda_err = pxl_interop_unmap(interop,stream); | |
// | |
// BLIT & SWAP FBO | |
// | |
pxl_interop_blit(interop); | |
// pxl_interop_clear(interop); | |
pxl_interop_swap(interop); | |
// | |
// SWAP WINDOW | |
// | |
glfwSwapBuffers(window); | |
// | |
// PUMP/POLL/WAIT | |
// | |
glfwPollEvents(); // glfwWaitEvents(); | |
} | |
// | |
// CLEANUP | |
// | |
pxl_interop_destroy(interop); | |
glfwDestroyWindow(window); | |
glfwTerminate(); | |
cuda(DeviceReset()); | |
// missing some clean up here | |
exit(EXIT_SUCCESS); | |
} | |
// | |
// | |
// |
Hi
Just went through the code. I understand CUDA but not very proficient in OpenGL. Do you have some help files to initiate a beginner like me? Thanks.
This sample code can only send buffer data from CUDA (any latest version) to OpenGL (the latest version is 4.6), right?
Yes, I think that's correct. I wrote this a very long time ago in order to understand CUDA>GL interop.
Today I would just use Vulkan.
Thanks @allanmac for your reply. The same thing I am currently trying to understand. I am using OptiX (latest version 7.4) and need to send the rendered data to opengl buffer. There is no such thing called OptiX->OGL interoperability, but CUDA to OpenGL possible. I have run your code on my machine, it's working perfectly, and now I'm trying to understand what you actually did. Can you suggest some resources about CUDA->OpenGL
interoperability?
If I remember correctly, there was no guide on this subject and I was mostly using the Runtime API docs and the GL interop example in the "CUDA Samples/" directory in the SDK.
It looks like the Samples/ are now on GitHub: https://github.com/NVIDIA/cuda-samples/tree/master/Samples/5_Domain_Specific/simpleGL
IMHO, there has not been much changed on this topic. I think it is not something that many developers really need. Thank you very much for the resource.
On Windows you can build with something like this:
nvcc -Xptxas=-v -o interop -I glad\output\include -I glfw\x64\include glad\output\src\glad.c main.c interop.c assert_cuda.c kernel.cu glfw\x64\lib-vc2013\glfw3dll.lib
The executable requires the glfw3.dll.