-
-
Save allanmac/4ff11985c3562830989f to your computer and use it in GitHub Desktop.
// | |
// | |
// | |
#include <stdlib.h> | |
#include <stdio.h> | |
// | |
// | |
// | |
#include "assert_cuda.h" | |
// | |
// | |
// | |
cudaError_t | |
cuda_assert(const cudaError_t code, const char* const file, const int line, const bool abort) | |
{ | |
if (code != cudaSuccess) | |
{ | |
fprintf(stderr,"cuda_assert: %s %s %d\n",cudaGetErrorString(code),file,line); | |
if (abort) | |
{ | |
cudaDeviceReset(); | |
exit(code); | |
} | |
} | |
return code; | |
} | |
// | |
// | |
// |
// | |
// | |
// | |
#pragma once | |
// | |
// | |
// | |
#include <cuda_runtime.h> | |
#include <stdbool.h> | |
// | |
// Beware that NVCC doesn't work with C files and __VA_ARGS__ | |
// | |
cudaError_t cuda_assert(const cudaError_t code, const char* const file, const int line, const bool abort); | |
#define cuda(...) cuda_assert((cuda##__VA_ARGS__), __FILE__, __LINE__, true); | |
// | |
// | |
// |
// | |
// | |
// | |
#include <glad/glad.h> | |
#include <GLFW/glfw3.h> | |
#include <cuda_gl_interop.h> | |
#include <stdlib.h> | |
// | |
// | |
// | |
#include "assert_cuda.h" | |
#include "interop.h" | |
// | |
// | |
// | |
struct pxl_interop | |
{ | |
// split GPUs? | |
bool multi_gpu; | |
// number of fbo's | |
int count; | |
int index; | |
// w x h | |
int width; | |
int height; | |
// GL buffers | |
GLuint* fb; | |
GLuint* rb; | |
// CUDA resources | |
cudaGraphicsResource_t* cgr; | |
cudaArray_t* ca; | |
}; | |
// | |
// | |
// | |
struct pxl_interop* | |
pxl_interop_create(const bool multi_gpu, const int fbo_count) | |
{ | |
struct pxl_interop* const interop = calloc(1,sizeof(*interop)); | |
interop->multi_gpu = multi_gpu; | |
interop->count = fbo_count; | |
interop->index = 0; | |
// allocate arrays | |
interop->fb = calloc(fbo_count,sizeof(*(interop->fb ))); | |
interop->rb = calloc(fbo_count,sizeof(*(interop->rb ))); | |
interop->cgr = calloc(fbo_count,sizeof(*(interop->cgr))); | |
interop->ca = calloc(fbo_count,sizeof(*(interop->ca))); | |
// render buffer object w/a color buffer | |
glCreateRenderbuffers(fbo_count,interop->rb); | |
// frame buffer object | |
glCreateFramebuffers(fbo_count,interop->fb); | |
// attach rbo to fbo | |
for (int index=0; index<fbo_count; index++) | |
{ | |
glNamedFramebufferRenderbuffer(interop->fb[index], | |
GL_COLOR_ATTACHMENT0, | |
GL_RENDERBUFFER, | |
interop->rb[index]); | |
} | |
// return it | |
return interop; | |
} | |
void | |
pxl_interop_destroy(struct pxl_interop* const interop) | |
{ | |
cudaError_t cuda_err; | |
// unregister CUDA resources | |
for (int index=0; index<interop->count; index++) | |
{ | |
if (interop->cgr[index] != NULL) | |
cuda_err = cuda(GraphicsUnregisterResource(interop->cgr[index])); | |
} | |
// delete rbo's | |
glDeleteRenderbuffers(interop->count,interop->rb); | |
// delete fbo's | |
glDeleteFramebuffers(interop->count,interop->fb); | |
// free buffers and resources | |
free(interop->fb); | |
free(interop->rb); | |
free(interop->cgr); | |
free(interop->ca); | |
// free interop | |
free(interop); | |
} | |
// | |
// | |
// | |
cudaError_t | |
pxl_interop_size_set(struct pxl_interop* const interop, const int width, const int height) | |
{ | |
cudaError_t cuda_err = cudaSuccess; | |
// save new size | |
interop->width = width; | |
interop->height = height; | |
// resize color buffer | |
for (int index=0; index<interop->count; index++) | |
{ | |
// unregister resource | |
if (interop->cgr[index] != NULL) | |
cuda_err = cuda(GraphicsUnregisterResource(interop->cgr[index])); | |
// resize rbo | |
glNamedRenderbufferStorage(interop->rb[index],GL_RGBA8,width,height); | |
// probe fbo status | |
// glCheckNamedFramebufferStatus(interop->fb[index],0); | |
// register rbo | |
cuda_err = cuda(GraphicsGLRegisterImage(&interop->cgr[index], | |
interop->rb[index], | |
GL_RENDERBUFFER, | |
cudaGraphicsRegisterFlagsSurfaceLoadStore | | |
cudaGraphicsRegisterFlagsWriteDiscard)); | |
} | |
// map graphics resources | |
cuda_err = cuda(GraphicsMapResources(interop->count,interop->cgr,0)); | |
// get CUDA Array refernces | |
for (int index=0; index<interop->count; index++) | |
{ | |
cuda_err = cuda(GraphicsSubResourceGetMappedArray(&interop->ca[index], | |
interop->cgr[index], | |
0,0)); | |
} | |
// unmap graphics resources | |
cuda_err = cuda(GraphicsUnmapResources(interop->count,interop->cgr,0)); | |
return cuda_err; | |
} | |
void | |
pxl_interop_size_get(struct pxl_interop* const interop, int* const width, int* const height) | |
{ | |
*width = interop->width; | |
*height = interop->height; | |
} | |
// | |
// | |
// | |
cudaError_t | |
pxl_interop_map(struct pxl_interop* const interop, cudaStream_t stream) | |
{ | |
if (!interop->multi_gpu) | |
return cudaSuccess; | |
// map graphics resources | |
return cuda(GraphicsMapResources(1,&interop->cgr[interop->index],stream)); | |
} | |
cudaError_t | |
pxl_interop_unmap(struct pxl_interop* const interop, cudaStream_t stream) | |
{ | |
if (!interop->multi_gpu) | |
return cudaSuccess; | |
return cuda(GraphicsUnmapResources(1,&interop->cgr[interop->index],stream)); | |
} | |
cudaError_t | |
pxl_interop_array_map(struct pxl_interop* const interop) | |
{ | |
// | |
// FIXME -- IS THIS EVEN NEEDED? | |
// | |
cudaError_t cuda_err; | |
// get a CUDA Array | |
cuda_err = cuda(GraphicsSubResourceGetMappedArray(&interop->ca[interop->index], | |
interop->cgr[interop->index], | |
0,0)); | |
return cuda_err; | |
} | |
// | |
// | |
// | |
cudaArray_const_t | |
pxl_interop_array_get(struct pxl_interop* const interop) | |
{ | |
return interop->ca[interop->index]; | |
} | |
int | |
pxl_interop_index_get(struct pxl_interop* const interop) | |
{ | |
return interop->index; | |
} | |
// | |
// | |
// | |
void | |
pxl_interop_swap(struct pxl_interop* const interop) | |
{ | |
interop->index = (interop->index + 1) % interop->count; | |
} | |
// | |
// | |
// | |
void | |
pxl_interop_clear(struct pxl_interop* const interop) | |
{ | |
/* | |
static const GLenum attachments[] = { GL_COLOR_ATTACHMENT0 }; | |
glInvalidateNamedFramebufferData(interop->fb[interop->index],1,attachments); | |
*/ | |
const GLfloat clear_color[] = { 1.0f, 1.0f, 1.0f, 1.0f }; | |
glClearNamedFramebufferfv(interop->fb[interop->index],GL_COLOR,0,clear_color); | |
} | |
// | |
// | |
// | |
void | |
pxl_interop_blit(struct pxl_interop* const interop) | |
{ | |
glBlitNamedFramebuffer(interop->fb[interop->index],0, | |
0,0, interop->width,interop->height, | |
0,interop->height,interop->width,0, | |
GL_COLOR_BUFFER_BIT, | |
GL_NEAREST); | |
} | |
// | |
// | |
// |
// | |
// | |
// | |
#pragma once | |
// | |
// | |
// | |
#include <cuda_runtime.h> | |
#include <stdbool.h> | |
// | |
// | |
// | |
struct pxl_interop* | |
pxl_interop_create(const bool multi_gpu, const int fbo_count); | |
void | |
pxl_interop_destroy(struct pxl_interop* const interop); | |
// | |
// | |
// | |
cudaError_t | |
pxl_interop_size_set(struct pxl_interop* const interop, const int width, const int height); | |
void | |
pxl_interop_size_get(struct pxl_interop* const interop, int* const width, int* const height); | |
// | |
// | |
// | |
cudaError_t | |
pxl_interop_map(struct pxl_interop* const interop, cudaStream_t stream); | |
cudaError_t | |
pxl_interop_unmap(struct pxl_interop* const interop, cudaStream_t stream); | |
cudaError_t | |
pxl_interop_array_map(struct pxl_interop* const interop); | |
// | |
// | |
// | |
cudaArray_const_t | |
pxl_interop_array_get(struct pxl_interop* const interop); | |
cudaStream_t | |
pxl_interop_stream_get(struct pxl_interop* const interop); | |
int | |
pxl_interop_index_get(struct pxl_interop* const interop); | |
// | |
// | |
// | |
void | |
pxl_interop_swap(struct pxl_interop* const interop); | |
void | |
pxl_interop_clear(struct pxl_interop* const interop); | |
void | |
pxl_interop_blit(struct pxl_interop* const interop); | |
// | |
// | |
// |
// -*- compile-command: "nvcc arch sm_50 -Xptxas=-v -cubin kernel.cu"; -*- | |
// | |
// | |
// | |
#ifdef __cplusplus | |
extern "C" { | |
#endif | |
#include "assert_cuda.h" | |
#ifdef __cplusplus | |
} | |
#endif | |
// | |
// | |
// | |
#define PXL_KERNEL_THREADS_PER_BLOCK 256 // enough for 4Kx2 monitor | |
// | |
// | |
// | |
surface<void,cudaSurfaceType2D> surf; | |
// | |
// | |
// | |
union pxl_rgbx_24 | |
{ | |
uint1 b32; | |
struct { | |
unsigned r : 8; | |
unsigned g : 8; | |
unsigned b : 8; | |
unsigned na : 8; | |
}; | |
}; | |
// | |
// | |
// | |
extern "C" | |
__global__ | |
void | |
pxl_kernel(const int width, const int height) | |
{ | |
// pixel coordinates | |
const int idx = (blockDim.x * blockIdx.x) + threadIdx.x; | |
const int x = idx % width; | |
const int y = idx / width; | |
#if 1 | |
// pixel color | |
const int t = (unsigned int)clock() / 1100000; // 1.1 GHz | |
const int xt = (idx + t) % width; | |
const unsigned int ramp = (unsigned int)(((float)xt / (float)(width-1)) * 255.0f + 0.5f); | |
const unsigned int bar = ((y + t) / 32) & 3; | |
union pxl_rgbx_24 rgbx; | |
rgbx.r = (bar == 0) || (bar == 1) ? ramp : 0; | |
rgbx.g = (bar == 0) || (bar == 2) ? ramp : 0; | |
rgbx.b = (bar == 0) || (bar == 3) ? ramp : 0; | |
rgbx.na = 255; | |
#else // DRAW A RED BORDER TO VALIDATE FLIPPED BLIT | |
const bool border = (x == 0) || (x == width-1) || (y == 0) || (y == height-1); | |
union pxl_rgbx_24 rgbx = { border ? 0xFF0000FF : 0xFF000000 }; | |
#endif | |
surf2Dwrite(rgbx.b32, // even simpler: (unsigned int)clock() | |
surf, | |
x*sizeof(rgbx), | |
y, | |
cudaBoundaryModeZero); // squelches out-of-bound writes | |
} | |
// | |
// | |
// | |
extern "C" | |
cudaError_t | |
pxl_kernel_launcher(cudaArray_const_t array, | |
const int width, | |
const int height, | |
cudaEvent_t event, | |
cudaStream_t stream) | |
{ | |
cudaError_t cuda_err; | |
// cuda_err = cudaEventRecord(event,stream); | |
cuda_err = cuda(BindSurfaceToArray(surf,array)); | |
if (cuda_err) | |
return cuda_err; | |
const int blocks = (width * height + PXL_KERNEL_THREADS_PER_BLOCK - 1) / PXL_KERNEL_THREADS_PER_BLOCK; | |
// cuda_err = cudaEventRecord(event,stream); | |
if (blocks > 0) | |
pxl_kernel<<<blocks,PXL_KERNEL_THREADS_PER_BLOCK,0,stream>>>(width,height); | |
// cuda_err = cudaStreamWaitEvent(stream,event,0); | |
return cudaSuccess; | |
} | |
// | |
// | |
// |
// | |
// | |
// | |
#include <glad/glad.h> | |
#include <GLFW/glfw3.h> | |
// | |
// | |
// | |
#include <stdlib.h> | |
#include <stdio.h> | |
#include <stdbool.h> | |
// | |
// | |
// | |
#include <cuda_gl_interop.h> | |
// | |
// | |
// | |
#include "assert_cuda.h" | |
#include "interop.h" | |
// | |
// FPS COUNTER FROM HERE: | |
// | |
// http://antongerdelan.net/opengl/glcontext2.html | |
// | |
static | |
void | |
pxl_glfw_fps(GLFWwindow* window) | |
{ | |
// static fps counters | |
static double stamp_prev = 0.0; | |
static int frame_count = 0; | |
// locals | |
const double stamp_curr = glfwGetTime(); | |
const double elapsed = stamp_curr - stamp_prev; | |
if (elapsed > 0.5) | |
{ | |
stamp_prev = stamp_curr; | |
const double fps = (double)frame_count / elapsed; | |
int width, height; | |
char tmp[64]; | |
glfwGetFramebufferSize(window,&width,&height); | |
sprintf_s(tmp,64,"(%u x %u) - FPS: %.2f",width,height,fps); | |
glfwSetWindowTitle(window,tmp); | |
frame_count = 0; | |
} | |
frame_count++; | |
} | |
// | |
// | |
// | |
static | |
void | |
pxl_glfw_error_callback(int error, const char* description) | |
{ | |
fputs(description,stderr); | |
} | |
static | |
void | |
pxl_glfw_key_callback(GLFWwindow* window, int key, int scancode, int action, int mods) | |
{ | |
if (key == GLFW_KEY_ESCAPE && action == GLFW_PRESS) | |
glfwSetWindowShouldClose(window, GL_TRUE); | |
} | |
static | |
void | |
pxl_glfw_init(GLFWwindow** window, const int width, const int height) | |
{ | |
// | |
// INITIALIZE GLFW/GLAD | |
// | |
glfwSetErrorCallback(pxl_glfw_error_callback); | |
if (!glfwInit()) | |
exit(EXIT_FAILURE); | |
glfwWindowHint(GLFW_DEPTH_BITS, 0); | |
glfwWindowHint(GLFW_STENCIL_BITS, 0); | |
glfwWindowHint(GLFW_SRGB_CAPABLE, GL_TRUE); | |
glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 4); | |
glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 5); | |
glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE); | |
#ifdef PXL_FULLSCREEN | |
GLFWmonitor* monitor = glfwGetPrimaryMonitor(); | |
const GLFWvidmode* mode = glfwGetVideoMode(monitor); | |
*window = glfwCreateWindow(mode->width,mode->height,"GLFW / CUDA Interop",monitor,NULL); | |
#else | |
*window = glfwCreateWindow(width,height,"GLFW / CUDA Interop",NULL,NULL); | |
#endif | |
if (*window == NULL) | |
{ | |
glfwTerminate(); | |
exit(EXIT_FAILURE); | |
} | |
glfwMakeContextCurrent(*window); | |
// set up GLAD | |
gladLoadGLLoader((GLADloadproc)glfwGetProcAddress); | |
// ignore vsync for now | |
glfwSwapInterval(0); | |
// only copy r/g/b | |
glColorMask(GL_TRUE,GL_TRUE,GL_TRUE,GL_FALSE); | |
// enable SRGB | |
// glEnable(GL_FRAMEBUFFER_SRGB); | |
} | |
// | |
// | |
// | |
static | |
void | |
pxl_glfw_window_size_callback(GLFWwindow* window, int width, int height) | |
{ | |
// get context | |
struct pxl_interop* const interop = glfwGetWindowUserPointer(window); | |
pxl_interop_size_set(interop,width,height); | |
} | |
// | |
// | |
// | |
cudaError_t | |
pxl_kernel_launcher(cudaArray_const_t array, | |
const int width, | |
const int height, | |
cudaEvent_t event, | |
cudaStream_t stream); | |
// | |
// | |
// | |
int | |
main(int argc, char* argv[]) | |
{ | |
// | |
// INIT GLFW | |
// | |
GLFWwindow* window; | |
pxl_glfw_init(&window,1024,1024); | |
// | |
// INIT CUDA | |
// | |
cudaError_t cuda_err; | |
int gl_device_id,gl_device_count; | |
cuda_err = cuda(GLGetDevices(&gl_device_count,&gl_device_id,1,cudaGLDeviceListAll)); | |
int cuda_device_id = (argc > 1) ? atoi(argv[1]) : gl_device_id; | |
cuda_err = cuda(SetDevice(cuda_device_id)); | |
// | |
// MULTI-GPU? | |
// | |
const bool multi_gpu = gl_device_id != cuda_device_id; | |
// | |
// INFO | |
// | |
struct cudaDeviceProp props; | |
cuda_err = cuda(GetDeviceProperties(&props,gl_device_id)); | |
printf("GL : %-24s (%2d)\n",props.name,props.multiProcessorCount); | |
cuda_err = cuda(GetDeviceProperties(&props,cuda_device_id)); | |
printf("CUDA : %-24s (%2d)\n",props.name,props.multiProcessorCount); | |
// | |
// CREATE CUDA STREAM & EVENT | |
// | |
cudaStream_t stream; | |
cudaEvent_t event; | |
cuda_err = cuda(StreamCreateWithFlags(&stream,cudaStreamDefault)); // optionally ignore default stream behavior | |
cuda_err = cuda(EventCreateWithFlags(&event,cudaEventBlockingSync)); // | cudaEventDisableTiming); | |
// | |
// CREATE INTEROP | |
// | |
// TESTING -- DO NOT SET TO FALSE, ONLY TRUE IS RELIABLE | |
struct pxl_interop* const interop = pxl_interop_create(true /*multi_gpu*/,2); | |
// | |
// RESIZE INTEROP | |
// | |
int width, height; | |
// get initial width/height | |
glfwGetFramebufferSize(window,&width,&height); | |
// resize with initial window dimensions | |
cuda_err = pxl_interop_size_set(interop,width,height); | |
// | |
// SET USER POINTER AND CALLBACKS | |
// | |
glfwSetWindowUserPointer (window,interop); | |
glfwSetKeyCallback (window,pxl_glfw_key_callback); | |
glfwSetFramebufferSizeCallback(window,pxl_glfw_window_size_callback); | |
// | |
// LOOP UNTIL DONE | |
// | |
while (!glfwWindowShouldClose(window)) | |
{ | |
// | |
// MONITOR FPS | |
// | |
pxl_glfw_fps(window); | |
// | |
// EXECUTE CUDA KERNEL ON RENDER BUFFER | |
// | |
int width,height; | |
cudaArray_t cuda_array; | |
pxl_interop_size_get(interop,&width,&height); | |
cuda_err = pxl_interop_map(interop,stream); | |
cuda_err = pxl_kernel_launcher(pxl_interop_array_get(interop), | |
width, | |
height, | |
event, | |
stream); | |
cuda_err = pxl_interop_unmap(interop,stream); | |
// | |
// BLIT & SWAP FBO | |
// | |
pxl_interop_blit(interop); | |
// pxl_interop_clear(interop); | |
pxl_interop_swap(interop); | |
// | |
// SWAP WINDOW | |
// | |
glfwSwapBuffers(window); | |
// | |
// PUMP/POLL/WAIT | |
// | |
glfwPollEvents(); // glfwWaitEvents(); | |
} | |
// | |
// CLEANUP | |
// | |
pxl_interop_destroy(interop); | |
glfwDestroyWindow(window); | |
glfwTerminate(); | |
cuda(DeviceReset()); | |
// missing some clean up here | |
exit(EXIT_SUCCESS); | |
} | |
// | |
// | |
// |
Yes, I think that's correct. I wrote this a very long time ago in order to understand CUDA>GL interop.
Today I would just use Vulkan.
Thanks @allanmac for your reply. The same thing I am currently trying to understand. I am using OptiX (latest version 7.4) and need to send the rendered data to opengl buffer. There is no such thing called OptiX->OGL interoperability, but CUDA to OpenGL possible. I have run your code on my machine, it's working perfectly, and now I'm trying to understand what you actually did. Can you suggest some resources about CUDA->OpenGL
interoperability?
If I remember correctly, there was no guide on this subject and I was mostly using the Runtime API docs and the GL interop example in the "CUDA Samples/" directory in the SDK.
It looks like the Samples/ are now on GitHub: https://github.com/NVIDIA/cuda-samples/tree/master/Samples/5_Domain_Specific/simpleGL
IMHO, there has not been much changed on this topic. I think it is not something that many developers really need. Thank you very much for the resource.
This sample code can only send buffer data from CUDA (any latest version) to OpenGL (the latest version is 4.6), right?