Skip to content

Instantly share code, notes, and snippets.

@stgatilov
Created May 10, 2024 16:23
Show Gist options
  • Save stgatilov/0bb58bf5296c3dfabd2eecd8dbf42237 to your computer and use it in GitHub Desktop.
Save stgatilov/0bb58bf5296c3dfabd2eecd8dbf42237 to your computer and use it in GitHub Desktop.
numba.cuda: device buffer shared with OpenGL
import ctypes
import numpy as np
import numba.cuda as cuda
import cuda.cudart as curt
# source: https://numba.discourse.group/t/cuda-opengl-interop/1898
class OpenglBufferInCuda:
"""A mapping of OpenGL buffer into CUDA device memory."""
def __init__(self, glbuf):
"""Establishes mapping for given GL buffer object."""
self._glbuf = glbuf
err, self._cuglresource = curt.cudaGraphicsGLRegisterBuffer(
self._glbuf,
curt.cudaGraphicsRegisterFlags.cudaGraphicsRegisterFlagsNone
)
assert err == 0
(err,) = curt.cudaGraphicsMapResources(1, self._cuglresource, 0)
assert err == 0
(err, self._cuptr, self._cusize) = curt.cudaGraphicsResourceGetMappedPointer(self._cuglresource)
assert err == 0
def shutdown(self):
"""Breaks mapping."""
if self._cuglresource is not None:
(err,) = curt.cudaGraphicsUnmapResources(1, self._cuglresource, 0)
(err,) = curt.cudaGraphicsUnregisterResource(self._cuglresource)
self._cuglresource = None
def asarray(self, dtype = np.float32, shape = None, *, strides = None, order = "C"):
"""
Returns DeviceNDArray view on mapped buffer compatible with numba.cuda.
Unless shape is specified, 1D array of maximum size is returned.
"""
if shape is None:
shape = self._cusize // np.dtype(dtype).itemsize
shape, strides, dtype = cuda.api.prepare_shape_strides_dtype(shape, strides, dtype, order)
datasize = cuda.driver.memory_size_from_info(shape, strides, dtype.itemsize)
assert datasize <= self._cusize
ctx = cuda.current_context()
cptr = ctypes.c_uint64(self._cuptr)
mem = cuda.driver.MemoryPointer(ctx, cptr, datasize)
return cuda.cudadrv.devicearray.DeviceNDArray(shape, strides, dtype, gpu_data = mem)
import contextlib
import glfw
from OpenGL.GL import *
import numpy as np
def raii(exitstack, obj, method = 'shutdown'):
"""Ensures given method is called on given object when ExitStack scope ends."""
exitstack.callback(getattr(obj, method))
return obj
# source: https://gist.github.com/DGriffin91/c986d9bce5a0549d6e5c207261636e62
@contextlib.contextmanager
def createGlfwWindow(window_name = "Python GLFW sample", width = 1280, height = 720, core = False, vsync = False):
assert glfw.init(), "Could not initialize GLFW"
glfw.window_hint(glfw.CONTEXT_VERSION_MAJOR, 3)
glfw.window_hint(glfw.CONTEXT_VERSION_MINOR, 3)
if core:
glfw.window_hint(glfw.OPENGL_PROFILE, glfw.OPENGL_CORE_PROFILE)
glfw.window_hint(glfw.OPENGL_FORWARD_COMPAT, 1)
else:
glfw.window_hint(glfw.OPENGL_PROFILE, glfw.OPENGL_COMPAT_PROFILE)
window = glfw.create_window(int(width), int(height), window_name, None, None)
assert window, "Could not create window and GL context"
glfw.make_context_current(window)
glfw.swap_interval(int(vsync))
try:
yield window
finally:
glfw.terminate()
class OpenglScreenBuffer:
"""Represents OpenGL buffer object with RGB8 data that can be blit to default framebuffer."""
def __init__(self, imgsize, *, channels = 3, dtype = np.uint8):
self.size = imgsize
self.channels = channels
self.dtype = dtype
# numba.cuda does not support images, so I'd better use buffers for interop
# since I'm lazy to write shaders, PBO is the only option
self.buffer = glGenBuffers(1)
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, self.buffer)
glBufferStorage(GL_PIXEL_UNPACK_BUFFER, self.size[0] * self.size[1] * channels * np.dtype(dtype).itemsize, None, 0)
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0)
# PBO cannot be bound to framebuffer, so we have to copy it to a texture
self._tex = glGenTextures(1)
glBindTexture(GL_TEXTURE_2D, self._tex)
formatmode = {
(3, np.uint8) : GL_RGB8,
(4, np.uint8) : GL_RGBA8,
(3, np.float32) : GL_RGB32F,
(4, np.float32) : GL_RGBA32F,
}[(self.channels, self.dtype)]
glTexStorage2D(GL_TEXTURE_2D, 1, formatmode, self.size[0], self.size[1])
glBindTexture(GL_TEXTURE_2D, 0)
# texture is attached to framebuffer, and framebuffer can be blit with scaling
self._fbo = glGenFramebuffers(1)
glBindFramebuffer(GL_FRAMEBUFFER, self._fbo)
glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, self._tex, 0)
glBindFramebuffer(GL_FRAMEBUFFER, 0)
def blittoscreen(self, screenSize):
# copy PBO context to texture
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, self.buffer)
glBindTexture(GL_TEXTURE_2D, self._tex)
colormode = [GL_RED, GL_RG, GL_RGB, GL_RGBA][self.channels - 1]
typemode = {np.uint8 : GL_UNSIGNED_BYTE, np.float32 : GL_FLOAT}[self.dtype]
glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, self.size[0], self.size[1], colormode, typemode, None)
glBindTexture(GL_TEXTURE_2D, 0)
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0)
# and blit framebuffer with texture into screen/default framebuffer
glBindFramebuffer(GL_READ_FRAMEBUFFER, self._fbo)
glBlitFramebuffer(0, 0, self.size[0], self.size[1], 0, 0, screenSize[0], screenSize[1], GL_COLOR_BUFFER_BIT, GL_LINEAR)
glBindFramebuffer(GL_READ_FRAMEBUFFER, 0)
def shutdown(self):
if self.buffer is not None:
glDeleteFramebuffers(1, [self._fbo])
glDeleteTextures(1, [self._tex])
glDeleteBuffers(1, [self.buffer])
self.buffer = None
import sys, contextlib, random, math
import glm
import imgui
import glfw
from OpenGL.GL import *
from imgui.integrations.glfw import GlfwRenderer
from PIL import Image
import numpy as np
import numba.cuda as cuda
from numba.cuda.random import create_xoroshiro128p_states
from numba_cuda_opengl import OpenglBufferInCuda
from opengl_utils import createGlfwWindow, OpenglScreenBuffer, raii
Vec3f = np.dtype([
('x', np.float32),
('y', np.float32),
('z', np.float32),
])
Vec3b = np.dtype([
('x', np.uint8),
('y', np.uint8),
('z', np.uint8),
])
Point = np.dtype([
('position', Vec3f),
('color', Vec3b),
('padding', np.uint8)
])
Cylinder = np.dtype([
('center', Vec3f),
('axisX', Vec3f),
('axisY', Vec3f),
('axisZ', Vec3f),
('radius', np.float32),
('height', np.float32),
('color', Vec3f),
])
@cuda.jit(device = True)
def clamp(x, l, r):
return min(max(x, l), r)
@cuda.jit
def generatePointCloud_kernel(cylinders, rngStates, points):
globalIdx = cuda.grid(1)
if globalIdx >= points.size:
return
cylIdx = int(math.floor(globalIdx * cylinders.size / points.size))
u = cuda.random.xoroshiro128p_uniform_float32(rngStates, globalIdx) * math.pi * 2
v = 2.0 * cuda.random.xoroshiro128p_uniform_float32(rngStates, globalIdx) - 1.0
lx = math.cos(u)
ly = math.sin(u)
ctr = cylinders[cylIdx].center
axisX = cylinders[cylIdx].axisX
axisY = cylinders[cylIdx].axisY
axisZ = cylinders[cylIdx].axisZ
radius = cylinders[cylIdx].radius
height = cylinders[cylIdx].height
col = cylinders[cylIdx].color
points[globalIdx].position.x = ctr.x + radius * (axisX.x * lx + axisY.x * ly) + height * (axisZ.x * v)
points[globalIdx].position.y = ctr.y + radius * (axisX.y * lx + axisY.y * ly) + height * (axisZ.y * v)
points[globalIdx].position.z = ctr.z + radius * (axisX.z * lx + axisY.z * ly) + height * (axisZ.z * v)
dr = (2 * cuda.random.xoroshiro128p_uniform_float32(rngStates, globalIdx) - 1) * 0.1
dg = (2 * cuda.random.xoroshiro128p_uniform_float32(rngStates, globalIdx) - 1) * 0.1
db = (2 * cuda.random.xoroshiro128p_uniform_float32(rngStates, globalIdx) - 1) * 0.1
points[globalIdx].color.x = round(255.0 * clamp(col.x + dr, 0.0, 1.0))
points[globalIdx].color.y = round(255.0 * clamp(col.y + dg, 0.0, 1.0))
points[globalIdx].color.z = round(255.0 * clamp(col.z + db, 0.0, 1.0))
def generatePointCloud(pointsBuffer):
cylinders = np.empty(10, dtype = Cylinder)
for c in range(cylinders.size):
center = glm.vec3()
center.x = random.uniform(-1.0, 1.0)
center.y = random.uniform(-1.0, 1.0)
center.z = random.uniform(-1.0, 1.0)
axisX = glm.vec3()
axisY = glm.vec3()
while True:
axisX.x = random.uniform(-1.0, 1.0)
axisX.y = random.uniform(-1.0, 1.0)
axisX.z = random.uniform(-1.0, 1.0)
if glm.length(axisX) > 1.0 and glm.length(axisX) < 0.5:
continue
axisX /= glm.length(axisX)
axisY.x = random.uniform(-1.0, 1.0)
axisY.y = random.uniform(-1.0, 1.0)
axisY.z = random.uniform(-1.0, 1.0)
if glm.length(axisY) > 1.0 and glm.length(axisY) < 0.5:
continue
axisY -= glm.dot(axisX, axisY) * axisX
if glm.length(axisY) < 0.3:
continue
axisY /= glm.length(axisY)
break
radius = random.uniform(0.1, 0.2)
height = random.uniform(0.5, 1.0)
color = glm.vec3()
color.x = random.uniform(0.3, 1.0)
color.y = random.uniform(0.3, 1.0)
color.z = random.uniform(0.3, 1.0)
def setVec(dst, src):
dst['x'] = src.x
dst['y'] = src.y
dst['z'] = src.z
setVec(cylinders[c]['center'], center)
setVec(cylinders[c]['axisX'], axisX)
setVec(cylinders[c]['axisY'], axisY)
setVec(cylinders[c]['axisZ'], glm.cross(axisX, axisY))
cylinders[c]['radius'] = radius
cylinders[c]['height'] = height
setVec(cylinders[c]['color'], color)
numPoints = pointsBuffer.size
rngStates = cuda.random.create_xoroshiro128p_states(numPoints, random.getrandbits(64))
generatePointCloud_kernel[math.ceil(numPoints / 256), 256](cylinders, rngStates, pointsBuffer)
cuda.synchronize()
@cuda.jit
def clearBuffers_kernel(depthBuffer, colorBuffer):
(x, y) = cuda.grid(2)
(ny, nx) = depthBuffer.shape
if x >= nx or y >= ny:
return
depthBuffer[y][x] = 1.0
colorBuffer[y][x].x = 0
colorBuffer[y][x].y = 0
colorBuffer[y][x].z = 0
@cuda.jit(device = True)
def transformPoint(worldPos, mvp, screenShape):
cposX = mvp[0][0] * worldPos.x + mvp[1][0] * worldPos.y + mvp[2][0] * worldPos.z + mvp[3][0]
cposY = mvp[0][1] * worldPos.x + mvp[1][1] * worldPos.y + mvp[2][1] * worldPos.z + mvp[3][1]
cposZ = mvp[0][2] * worldPos.x + mvp[1][2] * worldPos.y + mvp[2][2] * worldPos.z + mvp[3][2]
cposW = mvp[0][3] * worldPos.x + mvp[1][3] * worldPos.y + mvp[2][3] * worldPos.z + mvp[3][3]
if max(abs(cposX), abs(cposY), abs(cposZ)) >= cposW:
return -1, -1, -1
invW = 1.0 / cposW
sx = (((cposX * invW) + 1.0) * 0.5) * screenShape[1]
sy = (((cposY * invW) + 1.0) * 0.5) * screenShape[0]
depth = (((cposZ * invW) + 1.0) * 0.5)
return np.uint32(math.floor(sx)), np.uint32(math.floor(sy)), np.float32(depth)
@cuda.jit
def generateDepthBuffer_kernel(depthBuffer, pointsBuffer, mvp):
idx = cuda.grid(1)
if idx >= pointsBuffer.size:
return
pos = pointsBuffer[idx].position
pres = transformPoint(pos, mvp, depthBuffer.shape)
if pres[2] < 0.0:
return
cuda.atomic.min(depthBuffer, (pres[1], pres[0]), pres[2])
@cuda.jit
def generateColorBuffer_kernel(depthBuffer, pointsBuffer, colorBuffer, mvp):
idx = cuda.grid(1)
if idx >= pointsBuffer.size:
return
pos = pointsBuffer[idx].position
pres = transformPoint(pos, mvp, depthBuffer.shape)
if pres[2] < 0:
return
if pres[2] != depthBuffer[pres[1]][pres[0]]:
return
colorBuffer[pres[1]][pres[0]] = pointsBuffer[idx].color
def renderPointCloud(pointsBuffer, depthBuffer, colorBuffer, mvp):
numThreads = (16, 16)
numBlocks = (math.ceil(depthBuffer.shape[1] / 16), math.ceil(depthBuffer.shape[0] / 16))
clearBuffers_kernel[numBlocks, numThreads](depthBuffer, colorBuffer)
cuda.synchronize()
numThreads = 256
numBlocks = math.ceil(pointsBuffer.size / 256)
generateDepthBuffer_kernel[numBlocks, numThreads](depthBuffer, pointsBuffer, np.array(glm.transpose(mvp)))
cuda.synchronize()
numThreads = 256
numBlocks = math.ceil(pointsBuffer.size / 256)
generateColorBuffer_kernel[numBlocks, numThreads](depthBuffer, pointsBuffer, colorBuffer, np.array(glm.transpose(mvp)))
cuda.synchronize()
class GuiApp:
@staticmethod
def run():
with contextlib.ExitStack() as exitstack:
app = GuiApp(exitstack)
app.loop()
def __init__(self, exitstack, *, filename = None, numPoints = 10000000):
self.window = exitstack.enter_context(createGlfwWindow(
window_name = "Experimental point cloud viewer",
core = True
))
self.windowSize = glfw.get_framebuffer_size(self.window)
self.projectionMatrix = glm.perspective(math.pi / 3, self.windowSize[0] / self.windowSize[1], 1.0e-2, 100.0)
imgui.create_context()
self.imguiImpl = raii(exitstack, GlfwRenderer(self.window))
self.showImguiDemo = False
self.pitch = 0.0
self.yaw = 0.0
self.eye = glm.vec3(-3.0, 0.0, 0.0)
self.mouselook = False
self.pointCloudGl = glGenBuffers(1)
glBindBuffer(GL_ARRAY_BUFFER, self.pointCloudGl)
glBufferStorage(GL_ARRAY_BUFFER, numPoints * Point.itemsize, None, 0)
glBindBuffer(GL_ARRAY_BUFFER, 0)
self.pointCloudCuda = raii(exitstack, OpenglBufferInCuda(self.pointCloudGl)).asarray(dtype = Point)
generatePointCloud(self.pointCloudCuda)
windowShape = (self.windowSize[1], self.windowSize[0])
self.depthBufferCuda = cuda.device_array(windowShape, dtype = np.float32)
self.screenBuffer = raii(exitstack, OpenglScreenBuffer(self.windowSize))
self.screenBufferCuda = raii(exitstack, OpenglBufferInCuda(self.screenBuffer.buffer)).asarray(shape = windowShape, dtype = Vec3b)
def onKey(self, window, key, scancode, action, mods):
if action == glfw.PRESS:
if key == glfw.KEY_ESCAPE:
if self.mouselook:
self.mouselook = False
glfw.set_input_mode(self.window, glfw.RAW_MOUSE_MOTION, glfw.FALSE)
glfw.set_input_mode(self.window, glfw.CURSOR, glfw.CURSOR_NORMAL)
else:
self.mouselook = True
glfw.set_input_mode(self.window, glfw.CURSOR, glfw.CURSOR_DISABLED)
glfw.set_input_mode(self.window, glfw.RAW_MOUSE_MOTION, glfw.TRUE)
glfw.set_cursor_pos(self.window, 0, 0)
def onMouseMove(self, window, xpos, ypos):
if self.mouselook:
self.pitch -= ypos / self.windowSize[1] * 3
self.yaw -= xpos / self.windowSize[1] * 3
self.pitch = glm.clamp(self.pitch, -math.pi/2, math.pi/2)
self.yaw = math.fmod(self.yaw, math.pi*2)
glfw.set_cursor_pos(self.window, 0, 0)
def loop(self):
lastTime = glfw.get_time()
frameTime = 0.0
glfw.set_key_callback(self.window, self.onKey)
glfw.set_cursor_pos_callback(self.window, self.onMouseMove)
while not glfw.window_should_close(self.window):
glfw.poll_events()
self.imguiImpl.process_inputs()
forward = glm.vec3(
math.cos(self.pitch) * math.cos(self.yaw),
math.cos(self.pitch) * math.sin(self.yaw),
math.sin(self.pitch)
)
up = glm.vec3(
math.cos(self.pitch + math.pi/2) * math.cos(self.yaw),
math.cos(self.pitch + math.pi/2) * math.sin(self.yaw),
math.sin(self.pitch + math.pi/2)
)
df = 0
dr = 0
if glfw.get_key(self.window, glfw.KEY_W) == glfw.PRESS:
df += 1
if glfw.get_key(self.window, glfw.KEY_S) == glfw.PRESS:
df -= 1
if glfw.get_key(self.window, glfw.KEY_D) == glfw.PRESS:
dr += 1
if glfw.get_key(self.window, glfw.KEY_A) == glfw.PRESS:
dr -= 1
self.eye += (forward * df + glm.cross(forward, up) * dr) * frameTime * 3.0
self.viewMatrix = glm.lookAt(self.eye, self.eye + forward * (glm.length(self.eye) + 1), up)
mvp = self.projectionMatrix * self.viewMatrix
renderPointCloud(self.pointCloudCuda, self.depthBufferCuda, self.screenBufferCuda, mvp)
self.screenBuffer.blittoscreen(self.windowSize)
imgui.new_frame()
if imgui.begin("Settings", flags = imgui.WINDOW_ALWAYS_AUTO_RESIZE):
imgui.text("Frame time: %4.1f ms" % (frameTime * 1000.0))
_, self.showImguiDemo = imgui.checkbox("Show ImGui demo", self.showImguiDemo)
if self.showImguiDemo:
imgui.show_test_window()
imgui.end()
imgui.render()
self.imguiImpl.render(imgui.get_draw_data())
glfw.swap_buffers(self.window)
newTime = glfw.get_time()
frameTime = newTime - lastTime
lastTime = newTime
def main(argv):
GuiApp.run()
if __name__ == "__main__":
main(sys.argv[1:])
@stgatilov
Copy link
Author

You can put all files to same directory, then run pcviewer.py under Python 3 after installing enough pip modules.
You'll see some cylinders comprised of points on the screen.
Press Escape, then use WASD + mouselook to fly around.

This demo provides two pieces which might be useful:

  1. OpenglBufferInCuda --- takes OpenGL buffer object and returns numba.cuda array which points (uses CUDA + OpenGL interop).
  2. OpenglScreenBuffer --- maintains image buffer that can be blit to screen. Share this buffer to numba.cuda, and you can do any kind of realtime software rendering with numba.cuda!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment