Last active
December 28, 2023 02:44
Star
You must be signed in to star a gist
(Minimal) OpenGL to CUDA PBO example, purpose of this example is to evaluate why depth transfer is so slow
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// (Minimal) OpenGL to CUDA PBO example | |
// Maps the default OpenGL depth buffer to CUDA using GL_PIXEL_PACK_BUFFER_ARB | |
// Purpose of this example is to evaluate why depth transfer is so slow | |
// Play around with the example by commenting/uncommenting code in lines 77 ff. and in lines 110/112 | |
// | |
// In order to reproduce the issue, you require: | |
// - CUDA (tested with CUDA toolkit 7.5) | |
// - GLEW (a version with support for GL_KHR_debug) | |
// - (e.g.) freeglut (we need an OpenGL Debug context!) | |
// | |
// On Ubuntu 14.04, this example then compiles with the following command line | |
// - nvcc main.cu -lglut -lGLEW -lGL | |
// | |
#include <assert.h> | |
#include <stdio.h> | |
#include <GL/glew.h> | |
#include <GL/glut.h> | |
#include <GL/freeglut_ext.h> | |
#include <cuda_gl_interop.h> | |
#define WIDTH 800 | |
#define HEIGHT 800 | |
#define cutilSafeCall(err) __cudaSafeCall(err,__FILE__,__LINE__) | |
inline void __cudaSafeCall(cudaError err, | |
const char *file, const int line){ | |
if(cudaSuccess != err) { | |
printf("%s(%i) : cutilSafeCall() Runtime API error : %s.\n", | |
file, line, cudaGetErrorString(err) ); | |
exit(-1); | |
} | |
} | |
// Some dummy kernel to prevent optimizations | |
__global__ void kernel(unsigned*) | |
{ | |
} | |
// Debug callback for use with GL_KHR_debug | |
void debug_callback_func( | |
GLenum /*source*/, | |
GLenum type, | |
GLuint /*id*/, | |
GLenum severity, | |
GLsizei /*length*/, | |
const GLchar* message, | |
GLvoid* /*user_param*/ | |
) | |
{ | |
printf("%s\n", message); | |
} | |
// gl2cuda maps the opengl default frame buffer to cuda | |
void gl2cuda(int pitch, int h, GLenum format, GLenum type) | |
{ | |
GLuint pbo = 0; | |
cudaGraphicsResource_t resource = 0; | |
void* device_ptr = 0; | |
// Setup the PBO and register with CUDA | |
glGenBuffers(1, &pbo); | |
glBindBuffer(GL_PIXEL_PACK_BUFFER, pbo); | |
glBufferData(GL_PIXEL_PACK_BUFFER, pitch * h, 0, GL_STREAM_COPY); | |
cutilSafeCall( cudaGraphicsGLRegisterBuffer(&resource, pbo, cudaGraphicsRegisterFlagsReadOnly) ); | |
// Let's say format is GL_DEPTH_STENCIL. The following is a workaround | |
// which provides me with the correct results and is fast but is of course | |
// only viable when I don't need the default color buffer anymore afterwards | |
// assert(format == GL_DEPTH_STENCIL); | |
// glCopyPixels(0, 0, WIDTH, HEIGHT, GL_DEPTH_STENCIL_TO_RGBA_NV); | |
// format = GL_BGRA; | |
// type = GL_UNSIGNED_BYTE; | |
glReadPixels(0, 0, WIDTH, HEIGHT, format, type, 0); | |
// Map the graphics resource | |
cutilSafeCall( cudaGraphicsMapResources(1, &resource) ); | |
size_t size = 0; | |
cutilSafeCall( cudaGraphicsResourceGetMappedPointer(&device_ptr, &size, resource) ); | |
glBindBuffer(GL_PIXEL_PACK_BUFFER, 0); | |
// "Use" the data | |
kernel<<<1, 1>>>((unsigned*)device_ptr); | |
// Unmap and unregister the graphics resource | |
cutilSafeCall( cudaGraphicsUnmapResources(1, &resource) ); | |
cutilSafeCall( cudaGraphicsUnregisterResource(resource) ); | |
// Delete the PBO | |
glDeleteBuffers(1, &pbo); | |
} | |
// Display function, issues gl2cuda | |
void display_func() | |
{ | |
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); | |
// Readback with depth/stencil is achingly slow (alas you employ the workaround from line 77 ff.) | |
gl2cuda(WIDTH * sizeof(unsigned), HEIGHT, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8); | |
// Readback of colors (for comparison) is as fast as expected | |
// gl2cuda(WIDTH * sizeof(unsigned), HEIGHT, GL_BGRA, GL_UNSIGNED_BYTE); | |
glutSwapBuffers(); | |
} | |
int main(int argc, char** argv) | |
{ | |
glutInitWindowSize(WIDTH, HEIGHT); | |
glutInit(&argc, argv); | |
// Need freeglut for GLUT_DEBUG! | |
glutInitContextFlags(GLUT_DEBUG); | |
glutInitDisplayMode(GLUT_RGBA | GLUT_DEPTH | GLUT_STENCIL | GLUT_DOUBLE); | |
glutCreateWindow("Depth readback example"); | |
glewInit(); | |
// Init GL debug callback to show performance issues | |
if (GLEW_KHR_debug) | |
{ | |
glEnable(GL_DEBUG_OUTPUT); | |
glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS); | |
glDebugMessageCallback((GLDEBUGPROC)debug_callback_func, 0); | |
} | |
else | |
{ | |
printf("No GLEW_KHR_debug!"); | |
} | |
glutDisplayFunc(display_func); | |
glutMainLoop(); | |
} |
From this example I would think that you in fact want to use the depth buffer values in a cuda kernel?
If so, then going to host and back would be superfluous.
I am no expert in OpenGL. However, I do know the following.
PBO are GL 3.0 I believe and kind of out dated (cuda example with PBO doesn't work on my computer). See docs.gl.
Try the following:
- Create a framebuffer (FBO).
- Attach color and depth component.
- Associate the depth buffer of the FBO with a cuda resource. You may need the 'image' version of 'cudaGraphicsGLRegisterBuffer', so 'cudaGraphicsGLRegisterImage'.
- Bind FBO and do 'normal' drawing.
- Unbind FBO
6, Map the cuda resouce from 'cudaGraphicsGLRegisterBuffer/Image'. - Launch cuda kernel and fetch just rendered depth component in FBO.
- Blit FBO with screen in 'a' way. Eg, glBlitFrameBuffer or 'draw' full screen quad with FBO color attachment bound to a sampler.
Framebuffer example: https://open.gl/framebuffers
Side note: You may leave a buffer 'mapped'. This may cause async issues though. Unmapping a resource causes cuda to sync the compute shader with the host if im not mistaken.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I wonder why mapping and using the default OpenGL depth buffer (assuming DEPTH24_STENCIL8) with CUDA causes a device to host transfer. Am I doing something wrong? Mapping RGBA colors (delete line 110 and uncomment line 112) causes no such transfer and is fast.
Compile this example with
nvcc main.cu -lglut -lGLEW -lGL