stgatilov/numba_cuda_opengl.py

## numba_cuda_opengl.py
import ctypes
import numpy as np
import numba.cuda as cuda
import cuda.cudart as curt

# source: https://numba.discourse.group/t/cuda-opengl-interop/1898
class OpenglBufferInCuda:
    """A mapping of OpenGL buffer into CUDA device memory."""

    def __init__(self, glbuf):
        """Establishes mapping for given GL buffer object."""
        self._glbuf = glbuf

        err, self._cuglresource = curt.cudaGraphicsGLRegisterBuffer(
            self._glbuf,
            curt.cudaGraphicsRegisterFlags.cudaGraphicsRegisterFlagsNone
        )
        assert err == 0

        (err,) = curt.cudaGraphicsMapResources(1, self._cuglresource, 0)
        assert err == 0

        (err, self._cuptr, self._cusize) = curt.cudaGraphicsResourceGetMappedPointer(self._cuglresource)
        assert err == 0

    def shutdown(self):
        """Breaks mapping."""
        if self._cuglresource is not None:
            (err,) = curt.cudaGraphicsUnmapResources(1, self._cuglresource, 0)
            (err,) = curt.cudaGraphicsUnregisterResource(self._cuglresource)
            self._cuglresource = None

    def asarray(self, dtype = np.float32, shape = None, *, strides = None, order = "C"):
        """
        Returns DeviceNDArray view on mapped buffer compatible with numba.cuda.
        Unless shape is specified, 1D array of maximum size is returned.
        """
        if shape is None:
            shape = self._cusize // np.dtype(dtype).itemsize
        shape, strides, dtype = cuda.api.prepare_shape_strides_dtype(shape, strides, dtype, order)
        datasize = cuda.driver.memory_size_from_info(shape, strides, dtype.itemsize)
        assert datasize <= self._cusize

        ctx = cuda.current_context()
        cptr = ctypes.c_uint64(self._cuptr)
        mem = cuda.driver.MemoryPointer(ctx, cptr, datasize)
        return cuda.cudadrv.devicearray.DeviceNDArray(shape, strides, dtype, gpu_data = mem)

## opengl_utils.py
import contextlib
import glfw
from OpenGL.GL import *
import numpy as np


def raii(exitstack, obj, method = 'shutdown'):
    """Ensures given method is called on given object when ExitStack scope ends."""
    exitstack.callback(getattr(obj, method))
    return obj


# source: https://gist.github.com/DGriffin91/c986d9bce5a0549d6e5c207261636e62
@contextlib.contextmanager
def createGlfwWindow(window_name = "Python GLFW sample", width = 1280, height = 720, core = False, vsync = False):
    assert glfw.init(), "Could not initialize GLFW"

    glfw.window_hint(glfw.CONTEXT_VERSION_MAJOR, 3)
    glfw.window_hint(glfw.CONTEXT_VERSION_MINOR, 3)
    if core:
        glfw.window_hint(glfw.OPENGL_PROFILE, glfw.OPENGL_CORE_PROFILE)
        glfw.window_hint(glfw.OPENGL_FORWARD_COMPAT, 1)
    else:
        glfw.window_hint(glfw.OPENGL_PROFILE, glfw.OPENGL_COMPAT_PROFILE)

    window = glfw.create_window(int(width), int(height), window_name, None, None)
    assert window, "Could not create window and GL context"
    glfw.make_context_current(window)

    glfw.swap_interval(int(vsync))

    try:
        yield window
    finally:
        glfw.terminate()


class OpenglScreenBuffer:
    """Represents OpenGL buffer object with RGB8 data that can be blit to default framebuffer."""

    def __init__(self, imgsize, *, channels = 3, dtype = np.uint8):
        self.size = imgsize
        self.channels = channels
        self.dtype = dtype

        # numba.cuda does not support images, so I'd better use buffers for interop
        # since I'm lazy to write shaders, PBO is the only option
        self.buffer = glGenBuffers(1)
        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, self.buffer)
        glBufferStorage(GL_PIXEL_UNPACK_BUFFER, self.size[0] * self.size[1] * channels * np.dtype(dtype).itemsize, None, 0)
        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0)

        # PBO cannot be bound to framebuffer, so we have to copy it to a texture
        self._tex = glGenTextures(1)
        glBindTexture(GL_TEXTURE_2D, self._tex)
        formatmode = {
            (3, np.uint8) : GL_RGB8,
            (4, np.uint8) : GL_RGBA8,
            (3, np.float32) : GL_RGB32F,
            (4, np.float32) : GL_RGBA32F,
        }[(self.channels, self.dtype)]
        glTexStorage2D(GL_TEXTURE_2D, 1, formatmode, self.size[0], self.size[1])
        glBindTexture(GL_TEXTURE_2D, 0)

        # texture is attached to framebuffer, and framebuffer can be blit with scaling
        self._fbo = glGenFramebuffers(1)
        glBindFramebuffer(GL_FRAMEBUFFER, self._fbo)
        glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, self._tex, 0)
        glBindFramebuffer(GL_FRAMEBUFFER, 0)

    def blittoscreen(self, screenSize):
        # copy PBO context to texture
        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, self.buffer)
        glBindTexture(GL_TEXTURE_2D, self._tex)
        colormode = [GL_RED, GL_RG, GL_RGB, GL_RGBA][self.channels - 1]
        typemode = {np.uint8 : GL_UNSIGNED_BYTE, np.float32 : GL_FLOAT}[self.dtype]
        glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, self.size[0], self.size[1], colormode, typemode, None)
        glBindTexture(GL_TEXTURE_2D, 0)
        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0)

        # and blit framebuffer with texture into screen/default framebuffer
        glBindFramebuffer(GL_READ_FRAMEBUFFER, self._fbo)
        glBlitFramebuffer(0, 0, self.size[0], self.size[1], 0, 0, screenSize[0], screenSize[1], GL_COLOR_BUFFER_BIT, GL_LINEAR)
        glBindFramebuffer(GL_READ_FRAMEBUFFER, 0)

    def shutdown(self):
        if self.buffer is not None:
            glDeleteFramebuffers(1, [self._fbo])
            glDeleteTextures(1, [self._tex])
            glDeleteBuffers(1, [self.buffer])
            self.buffer = None

## pcviewer.py
import sys, contextlib, random, math
import glm

import imgui
import glfw
from OpenGL.GL import *
from imgui.integrations.glfw import GlfwRenderer

from PIL import Image
import numpy as np
import numba.cuda as cuda
from numba.cuda.random import create_xoroshiro128p_states

from numba_cuda_opengl import OpenglBufferInCuda
from opengl_utils import createGlfwWindow, OpenglScreenBuffer, raii

Vec3f = np.dtype([
    ('x', np.float32),
    ('y', np.float32),
    ('z', np.float32),
])
Vec3b = np.dtype([
    ('x', np.uint8),
    ('y', np.uint8),
    ('z', np.uint8),
])
Point = np.dtype([
    ('position', Vec3f),
    ('color', Vec3b),
    ('padding', np.uint8)
])
Cylinder = np.dtype([
    ('center', Vec3f),
    ('axisX', Vec3f),
    ('axisY', Vec3f),
    ('axisZ', Vec3f),
    ('radius', np.float32),
    ('height', np.float32),
    ('color', Vec3f),
])

@cuda.jit(device = True)
def clamp(x, l, r):
    return min(max(x, l), r)

@cuda.jit
def generatePointCloud_kernel(cylinders, rngStates, points):
    globalIdx = cuda.grid(1)
    if globalIdx >= points.size:
        return
    cylIdx = int(math.floor(globalIdx * cylinders.size / points.size))

    u = cuda.random.xoroshiro128p_uniform_float32(rngStates, globalIdx) * math.pi * 2
    v = 2.0 * cuda.random.xoroshiro128p_uniform_float32(rngStates, globalIdx) - 1.0
    lx = math.cos(u)
    ly = math.sin(u)

    ctr = cylinders[cylIdx].center
    axisX = cylinders[cylIdx].axisX
    axisY = cylinders[cylIdx].axisY
    axisZ = cylinders[cylIdx].axisZ
    radius = cylinders[cylIdx].radius
    height = cylinders[cylIdx].height
    col = cylinders[cylIdx].color

    points[globalIdx].position.x = ctr.x + radius * (axisX.x * lx + axisY.x * ly) + height * (axisZ.x * v)
    points[globalIdx].position.y = ctr.y + radius * (axisX.y * lx + axisY.y * ly) + height * (axisZ.y * v)
    points[globalIdx].position.z = ctr.z + radius * (axisX.z * lx + axisY.z * ly) + height * (axisZ.z * v)

    dr = (2 * cuda.random.xoroshiro128p_uniform_float32(rngStates, globalIdx) - 1) * 0.1
    dg = (2 * cuda.random.xoroshiro128p_uniform_float32(rngStates, globalIdx) - 1) * 0.1
    db = (2 * cuda.random.xoroshiro128p_uniform_float32(rngStates, globalIdx) - 1) * 0.1
    points[globalIdx].color.x = round(255.0 * clamp(col.x + dr, 0.0, 1.0))
    points[globalIdx].color.y = round(255.0 * clamp(col.y + dg, 0.0, 1.0))
    points[globalIdx].color.z = round(255.0 * clamp(col.z + db, 0.0, 1.0))

def generatePointCloud(pointsBuffer):
    cylinders = np.empty(10, dtype = Cylinder)

    for c in range(cylinders.size):
        center = glm.vec3()
        center.x = random.uniform(-1.0, 1.0)
        center.y = random.uniform(-1.0, 1.0)
        center.z = random.uniform(-1.0, 1.0)

        axisX = glm.vec3()
        axisY = glm.vec3()
        while True:
            axisX.x = random.uniform(-1.0, 1.0)
            axisX.y = random.uniform(-1.0, 1.0)
            axisX.z = random.uniform(-1.0, 1.0)
            if glm.length(axisX) > 1.0 and glm.length(axisX) < 0.5:
                continue
            axisX /= glm.length(axisX)

            axisY.x = random.uniform(-1.0, 1.0)
            axisY.y = random.uniform(-1.0, 1.0)
            axisY.z = random.uniform(-1.0, 1.0)
            if glm.length(axisY) > 1.0 and glm.length(axisY) < 0.5:
                continue
            axisY -= glm.dot(axisX, axisY) * axisX
            if glm.length(axisY) < 0.3:
                continue
            axisY /= glm.length(axisY)

            break

        radius = random.uniform(0.1, 0.2)
        height = random.uniform(0.5, 1.0)

        color = glm.vec3()
        color.x = random.uniform(0.3, 1.0)
        color.y = random.uniform(0.3, 1.0)
        color.z = random.uniform(0.3, 1.0)

        def setVec(dst, src):
            dst['x'] = src.x
            dst['y'] = src.y
            dst['z'] = src.z
        setVec(cylinders[c]['center'], center)
        setVec(cylinders[c]['axisX'], axisX)
        setVec(cylinders[c]['axisY'], axisY)
        setVec(cylinders[c]['axisZ'], glm.cross(axisX, axisY))
        cylinders[c]['radius'] = radius
        cylinders[c]['height'] = height
        setVec(cylinders[c]['color'], color)

    numPoints = pointsBuffer.size
    rngStates = cuda.random.create_xoroshiro128p_states(numPoints, random.getrandbits(64))
    generatePointCloud_kernel[math.ceil(numPoints / 256), 256](cylinders, rngStates, pointsBuffer)
    cuda.synchronize()

@cuda.jit
def clearBuffers_kernel(depthBuffer, colorBuffer):
    (x, y) = cuda.grid(2)
    (ny, nx) = depthBuffer.shape
    if x >= nx or y >= ny:
        return
    depthBuffer[y][x] = 1.0
    colorBuffer[y][x].x = 0
    colorBuffer[y][x].y = 0
    colorBuffer[y][x].z = 0

@cuda.jit(device = True)
def transformPoint(worldPos, mvp, screenShape):
    cposX = mvp[0][0] * worldPos.x + mvp[1][0] * worldPos.y + mvp[2][0] * worldPos.z + mvp[3][0]
    cposY = mvp[0][1] * worldPos.x + mvp[1][1] * worldPos.y + mvp[2][1] * worldPos.z + mvp[3][1]
    cposZ = mvp[0][2] * worldPos.x + mvp[1][2] * worldPos.y + mvp[2][2] * worldPos.z + mvp[3][2]
    cposW = mvp[0][3] * worldPos.x + mvp[1][3] * worldPos.y + mvp[2][3] * worldPos.z + mvp[3][3]

    if max(abs(cposX), abs(cposY), abs(cposZ)) >= cposW:
        return -1, -1, -1

    invW = 1.0 / cposW
    sx = (((cposX * invW) + 1.0) * 0.5) * screenShape[1]
    sy = (((cposY * invW) + 1.0) * 0.5) * screenShape[0]
    depth = (((cposZ * invW) + 1.0) * 0.5)

    return np.uint32(math.floor(sx)), np.uint32(math.floor(sy)), np.float32(depth)

@cuda.jit
def generateDepthBuffer_kernel(depthBuffer, pointsBuffer, mvp):
    idx = cuda.grid(1)
    if idx >= pointsBuffer.size:
        return

    pos = pointsBuffer[idx].position
    pres = transformPoint(pos, mvp, depthBuffer.shape)
    if pres[2] < 0.0:
        return

    cuda.atomic.min(depthBuffer, (pres[1], pres[0]), pres[2])

@cuda.jit
def generateColorBuffer_kernel(depthBuffer, pointsBuffer, colorBuffer, mvp):
    idx = cuda.grid(1)
    if idx >= pointsBuffer.size:
        return

    pos = pointsBuffer[idx].position
    pres = transformPoint(pos, mvp, depthBuffer.shape)
    if pres[2] < 0:
        return
    if pres[2] != depthBuffer[pres[1]][pres[0]]:
        return

    colorBuffer[pres[1]][pres[0]] = pointsBuffer[idx].color

def renderPointCloud(pointsBuffer, depthBuffer, colorBuffer, mvp):
    numThreads = (16, 16)
    numBlocks = (math.ceil(depthBuffer.shape[1] / 16), math.ceil(depthBuffer.shape[0] / 16))
    clearBuffers_kernel[numBlocks, numThreads](depthBuffer, colorBuffer)
    cuda.synchronize()

    numThreads = 256
    numBlocks = math.ceil(pointsBuffer.size / 256)
    generateDepthBuffer_kernel[numBlocks, numThreads](depthBuffer, pointsBuffer, np.array(glm.transpose(mvp)))
    cuda.synchronize()

    numThreads = 256
    numBlocks = math.ceil(pointsBuffer.size / 256)
    generateColorBuffer_kernel[numBlocks, numThreads](depthBuffer, pointsBuffer, colorBuffer, np.array(glm.transpose(mvp)))
    cuda.synchronize()


class GuiApp:
    @staticmethod
    def run():
        with contextlib.ExitStack() as exitstack:
            app = GuiApp(exitstack)
            app.loop()

    def __init__(self, exitstack, *, filename = None, numPoints = 10000000):
        self.window = exitstack.enter_context(createGlfwWindow(
            window_name = "Experimental point cloud viewer",
            core = True
        ))

        self.windowSize = glfw.get_framebuffer_size(self.window)
        self.projectionMatrix = glm.perspective(math.pi / 3, self.windowSize[0] / self.windowSize[1], 1.0e-2, 100.0)

        imgui.create_context()
        self.imguiImpl = raii(exitstack, GlfwRenderer(self.window))

        self.showImguiDemo = False
        self.pitch = 0.0
        self.yaw = 0.0
        self.eye = glm.vec3(-3.0, 0.0, 0.0)
        self.mouselook = False

        self.pointCloudGl = glGenBuffers(1)
        glBindBuffer(GL_ARRAY_BUFFER, self.pointCloudGl)
        glBufferStorage(GL_ARRAY_BUFFER, numPoints * Point.itemsize, None, 0)
        glBindBuffer(GL_ARRAY_BUFFER, 0)

        self.pointCloudCuda = raii(exitstack, OpenglBufferInCuda(self.pointCloudGl)).asarray(dtype = Point)
        generatePointCloud(self.pointCloudCuda)

        windowShape = (self.windowSize[1], self.windowSize[0])
        self.depthBufferCuda = cuda.device_array(windowShape, dtype = np.float32)
        self.screenBuffer = raii(exitstack, OpenglScreenBuffer(self.windowSize))
        self.screenBufferCuda = raii(exitstack, OpenglBufferInCuda(self.screenBuffer.buffer)).asarray(shape = windowShape, dtype = Vec3b)

    def onKey(self, window, key, scancode, action, mods):
        if action == glfw.PRESS:
            if key == glfw.KEY_ESCAPE:
                if self.mouselook:
                    self.mouselook = False
                    glfw.set_input_mode(self.window, glfw.RAW_MOUSE_MOTION, glfw.FALSE)
                    glfw.set_input_mode(self.window, glfw.CURSOR, glfw.CURSOR_NORMAL)
                else:
                    self.mouselook = True
                    glfw.set_input_mode(self.window, glfw.CURSOR, glfw.CURSOR_DISABLED)
                    glfw.set_input_mode(self.window, glfw.RAW_MOUSE_MOTION, glfw.TRUE)
                    glfw.set_cursor_pos(self.window, 0, 0)

    def onMouseMove(self, window, xpos, ypos):
        if self.mouselook:
            self.pitch -= ypos / self.windowSize[1] * 3
            self.yaw -= xpos / self.windowSize[1] * 3
            self.pitch = glm.clamp(self.pitch, -math.pi/2, math.pi/2)
            self.yaw = math.fmod(self.yaw, math.pi*2)
            glfw.set_cursor_pos(self.window, 0, 0)

    def loop(self):
        lastTime = glfw.get_time()
        frameTime = 0.0

        glfw.set_key_callback(self.window, self.onKey)
        glfw.set_cursor_pos_callback(self.window, self.onMouseMove)

        while not glfw.window_should_close(self.window):
            glfw.poll_events()
            self.imguiImpl.process_inputs()

            forward = glm.vec3(
                math.cos(self.pitch) * math.cos(self.yaw),
                math.cos(self.pitch) * math.sin(self.yaw),
                math.sin(self.pitch)
            )
            up = glm.vec3(
                math.cos(self.pitch + math.pi/2) * math.cos(self.yaw),
                math.cos(self.pitch + math.pi/2) * math.sin(self.yaw),
                math.sin(self.pitch + math.pi/2)
            )
            df = 0
            dr = 0
            if glfw.get_key(self.window, glfw.KEY_W) == glfw.PRESS:
                df += 1
            if glfw.get_key(self.window, glfw.KEY_S) == glfw.PRESS:
                df -= 1
            if glfw.get_key(self.window, glfw.KEY_D) == glfw.PRESS:
                dr += 1
            if glfw.get_key(self.window, glfw.KEY_A) == glfw.PRESS:
                dr -= 1
            self.eye += (forward * df + glm.cross(forward, up) * dr) * frameTime * 3.0
            self.viewMatrix = glm.lookAt(self.eye, self.eye + forward * (glm.length(self.eye) + 1), up)

            mvp = self.projectionMatrix * self.viewMatrix

            renderPointCloud(self.pointCloudCuda, self.depthBufferCuda, self.screenBufferCuda, mvp)
            self.screenBuffer.blittoscreen(self.windowSize)

            imgui.new_frame()
            if imgui.begin("Settings", flags = imgui.WINDOW_ALWAYS_AUTO_RESIZE):
                imgui.text("Frame time: %4.1f ms" % (frameTime * 1000.0))

                _, self.showImguiDemo = imgui.checkbox("Show ImGui demo", self.showImguiDemo)
                if self.showImguiDemo:
                    imgui.show_test_window()

                imgui.end()
            imgui.render()
            self.imguiImpl.render(imgui.get_draw_data())

            glfw.swap_buffers(self.window)

            newTime = glfw.get_time()
            frameTime = newTime - lastTime
            lastTime = newTime


def main(argv):
    GuiApp.run()

if __name__ == "__main__":
    main(sys.argv[1:])
	import ctypes
	import numpy as np
	import numba.cuda as cuda
	import cuda.cudart as curt

	# source: https://numba.discourse.group/t/cuda-opengl-interop/1898
	class OpenglBufferInCuda:
	"""A mapping of OpenGL buffer into CUDA device memory."""

	def __init__(self, glbuf):
	"""Establishes mapping for given GL buffer object."""
	self._glbuf = glbuf

	err, self._cuglresource = curt.cudaGraphicsGLRegisterBuffer(
	self._glbuf,
	curt.cudaGraphicsRegisterFlags.cudaGraphicsRegisterFlagsNone
	)
	assert err == 0

	(err,) = curt.cudaGraphicsMapResources(1, self._cuglresource, 0)
	assert err == 0

	(err, self._cuptr, self._cusize) = curt.cudaGraphicsResourceGetMappedPointer(self._cuglresource)
	assert err == 0

	def shutdown(self):
	"""Breaks mapping."""
	if self._cuglresource is not None:
	(err,) = curt.cudaGraphicsUnmapResources(1, self._cuglresource, 0)
	(err,) = curt.cudaGraphicsUnregisterResource(self._cuglresource)
	self._cuglresource = None

	def asarray(self, dtype = np.float32, shape = None, *, strides = None, order = "C"):
	"""
	Returns DeviceNDArray view on mapped buffer compatible with numba.cuda.
	Unless shape is specified, 1D array of maximum size is returned.
	"""
	if shape is None:
	shape = self._cusize // np.dtype(dtype).itemsize
	shape, strides, dtype = cuda.api.prepare_shape_strides_dtype(shape, strides, dtype, order)
	datasize = cuda.driver.memory_size_from_info(shape, strides, dtype.itemsize)
	assert datasize <= self._cusize

	ctx = cuda.current_context()
	cptr = ctypes.c_uint64(self._cuptr)
	mem = cuda.driver.MemoryPointer(ctx, cptr, datasize)
	return cuda.cudadrv.devicearray.DeviceNDArray(shape, strides, dtype, gpu_data = mem)
	import contextlib
	import glfw
	from OpenGL.GL import *
	import numpy as np


	def raii(exitstack, obj, method = 'shutdown'):
	"""Ensures given method is called on given object when ExitStack scope ends."""
	exitstack.callback(getattr(obj, method))
	return obj


	# source: https://gist.github.com/DGriffin91/c986d9bce5a0549d6e5c207261636e62
	@contextlib.contextmanager
	def createGlfwWindow(window_name = "Python GLFW sample", width = 1280, height = 720, core = False, vsync = False):
	assert glfw.init(), "Could not initialize GLFW"

	glfw.window_hint(glfw.CONTEXT_VERSION_MAJOR, 3)
	glfw.window_hint(glfw.CONTEXT_VERSION_MINOR, 3)
	if core:
	glfw.window_hint(glfw.OPENGL_PROFILE, glfw.OPENGL_CORE_PROFILE)
	glfw.window_hint(glfw.OPENGL_FORWARD_COMPAT, 1)
	else:
	glfw.window_hint(glfw.OPENGL_PROFILE, glfw.OPENGL_COMPAT_PROFILE)

	window = glfw.create_window(int(width), int(height), window_name, None, None)
	assert window, "Could not create window and GL context"
	glfw.make_context_current(window)

	glfw.swap_interval(int(vsync))

	try:
	yield window
	finally:
	glfw.terminate()


	class OpenglScreenBuffer:
	"""Represents OpenGL buffer object with RGB8 data that can be blit to default framebuffer."""

	def __init__(self, imgsize, *, channels = 3, dtype = np.uint8):
	self.size = imgsize
	self.channels = channels
	self.dtype = dtype

	# numba.cuda does not support images, so I'd better use buffers for interop
	# since I'm lazy to write shaders, PBO is the only option
	self.buffer = glGenBuffers(1)
	glBindBuffer(GL_PIXEL_UNPACK_BUFFER, self.buffer)
	glBufferStorage(GL_PIXEL_UNPACK_BUFFER, self.size[0] * self.size[1] * channels * np.dtype(dtype).itemsize, None, 0)
	glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0)

	# PBO cannot be bound to framebuffer, so we have to copy it to a texture
	self._tex = glGenTextures(1)
	glBindTexture(GL_TEXTURE_2D, self._tex)
	formatmode = {
	(3, np.uint8) : GL_RGB8,
	(4, np.uint8) : GL_RGBA8,
	(3, np.float32) : GL_RGB32F,
	(4, np.float32) : GL_RGBA32F,
	}[(self.channels, self.dtype)]
	glTexStorage2D(GL_TEXTURE_2D, 1, formatmode, self.size[0], self.size[1])
	glBindTexture(GL_TEXTURE_2D, 0)

	# texture is attached to framebuffer, and framebuffer can be blit with scaling
	self._fbo = glGenFramebuffers(1)
	glBindFramebuffer(GL_FRAMEBUFFER, self._fbo)
	glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, self._tex, 0)
	glBindFramebuffer(GL_FRAMEBUFFER, 0)

	def blittoscreen(self, screenSize):
	# copy PBO context to texture
	glBindBuffer(GL_PIXEL_UNPACK_BUFFER, self.buffer)
	glBindTexture(GL_TEXTURE_2D, self._tex)
	colormode = [GL_RED, GL_RG, GL_RGB, GL_RGBA][self.channels - 1]
	typemode = {np.uint8 : GL_UNSIGNED_BYTE, np.float32 : GL_FLOAT}[self.dtype]
	glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, self.size[0], self.size[1], colormode, typemode, None)
	glBindTexture(GL_TEXTURE_2D, 0)
	glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0)

	# and blit framebuffer with texture into screen/default framebuffer
	glBindFramebuffer(GL_READ_FRAMEBUFFER, self._fbo)
	glBlitFramebuffer(0, 0, self.size[0], self.size[1], 0, 0, screenSize[0], screenSize[1], GL_COLOR_BUFFER_BIT, GL_LINEAR)
	glBindFramebuffer(GL_READ_FRAMEBUFFER, 0)

	def shutdown(self):
	if self.buffer is not None:
	glDeleteFramebuffers(1, [self._fbo])
	glDeleteTextures(1, [self._tex])
	glDeleteBuffers(1, [self.buffer])
	self.buffer = None
	import sys, contextlib, random, math
	import glm

	import imgui
	import glfw
	from OpenGL.GL import *
	from imgui.integrations.glfw import GlfwRenderer

	from PIL import Image
	import numpy as np
	import numba.cuda as cuda
	from numba.cuda.random import create_xoroshiro128p_states

	from numba_cuda_opengl import OpenglBufferInCuda
	from opengl_utils import createGlfwWindow, OpenglScreenBuffer, raii

	Vec3f = np.dtype([
	('x', np.float32),
	('y', np.float32),
	('z', np.float32),
	])
	Vec3b = np.dtype([
	('x', np.uint8),
	('y', np.uint8),
	('z', np.uint8),
	])
	Point = np.dtype([
	('position', Vec3f),
	('color', Vec3b),
	('padding', np.uint8)
	])
	Cylinder = np.dtype([
	('center', Vec3f),
	('axisX', Vec3f),
	('axisY', Vec3f),
	('axisZ', Vec3f),
	('radius', np.float32),
	('height', np.float32),
	('color', Vec3f),
	])

	@cuda.jit(device = True)
	def clamp(x, l, r):
	return min(max(x, l), r)

	@cuda.jit
	def generatePointCloud_kernel(cylinders, rngStates, points):
	globalIdx = cuda.grid(1)
	if globalIdx >= points.size:
	return
	cylIdx = int(math.floor(globalIdx * cylinders.size / points.size))

	u = cuda.random.xoroshiro128p_uniform_float32(rngStates, globalIdx) * math.pi * 2
	v = 2.0 * cuda.random.xoroshiro128p_uniform_float32(rngStates, globalIdx) - 1.0
	lx = math.cos(u)
	ly = math.sin(u)

	ctr = cylinders[cylIdx].center
	axisX = cylinders[cylIdx].axisX
	axisY = cylinders[cylIdx].axisY
	axisZ = cylinders[cylIdx].axisZ
	radius = cylinders[cylIdx].radius
	height = cylinders[cylIdx].height
	col = cylinders[cylIdx].color

	points[globalIdx].position.x = ctr.x + radius * (axisX.x * lx + axisY.x * ly) + height * (axisZ.x * v)
	points[globalIdx].position.y = ctr.y + radius * (axisX.y * lx + axisY.y * ly) + height * (axisZ.y * v)
	points[globalIdx].position.z = ctr.z + radius * (axisX.z * lx + axisY.z * ly) + height * (axisZ.z * v)

	dr = (2 * cuda.random.xoroshiro128p_uniform_float32(rngStates, globalIdx) - 1) * 0.1
	dg = (2 * cuda.random.xoroshiro128p_uniform_float32(rngStates, globalIdx) - 1) * 0.1
	db = (2 * cuda.random.xoroshiro128p_uniform_float32(rngStates, globalIdx) - 1) * 0.1
	points[globalIdx].color.x = round(255.0 * clamp(col.x + dr, 0.0, 1.0))
	points[globalIdx].color.y = round(255.0 * clamp(col.y + dg, 0.0, 1.0))
	points[globalIdx].color.z = round(255.0 * clamp(col.z + db, 0.0, 1.0))

	def generatePointCloud(pointsBuffer):
	cylinders = np.empty(10, dtype = Cylinder)

	for c in range(cylinders.size):
	center = glm.vec3()
	center.x = random.uniform(-1.0, 1.0)
	center.y = random.uniform(-1.0, 1.0)
	center.z = random.uniform(-1.0, 1.0)

	axisX = glm.vec3()
	axisY = glm.vec3()
	while True:
	axisX.x = random.uniform(-1.0, 1.0)
	axisX.y = random.uniform(-1.0, 1.0)
	axisX.z = random.uniform(-1.0, 1.0)
	if glm.length(axisX) > 1.0 and glm.length(axisX) < 0.5:
	continue
	axisX /= glm.length(axisX)

	axisY.x = random.uniform(-1.0, 1.0)
	axisY.y = random.uniform(-1.0, 1.0)
	axisY.z = random.uniform(-1.0, 1.0)
	if glm.length(axisY) > 1.0 and glm.length(axisY) < 0.5:
	continue
	axisY -= glm.dot(axisX, axisY) * axisX
	if glm.length(axisY) < 0.3:
	continue
	axisY /= glm.length(axisY)

	break

	radius = random.uniform(0.1, 0.2)
	height = random.uniform(0.5, 1.0)

	color = glm.vec3()
	color.x = random.uniform(0.3, 1.0)
	color.y = random.uniform(0.3, 1.0)
	color.z = random.uniform(0.3, 1.0)

	def setVec(dst, src):
	dst['x'] = src.x
	dst['y'] = src.y
	dst['z'] = src.z
	setVec(cylinders[c]['center'], center)
	setVec(cylinders[c]['axisX'], axisX)
	setVec(cylinders[c]['axisY'], axisY)
	setVec(cylinders[c]['axisZ'], glm.cross(axisX, axisY))
	cylinders[c]['radius'] = radius
	cylinders[c]['height'] = height
	setVec(cylinders[c]['color'], color)

	numPoints = pointsBuffer.size
	rngStates = cuda.random.create_xoroshiro128p_states(numPoints, random.getrandbits(64))
	generatePointCloud_kernel[math.ceil(numPoints / 256), 256](cylinders, rngStates, pointsBuffer)
	cuda.synchronize()

	@cuda.jit
	def clearBuffers_kernel(depthBuffer, colorBuffer):
	(x, y) = cuda.grid(2)
	(ny, nx) = depthBuffer.shape
	if x >= nx or y >= ny:
	return
	depthBuffer[y][x] = 1.0
	colorBuffer[y][x].x = 0
	colorBuffer[y][x].y = 0
	colorBuffer[y][x].z = 0

	@cuda.jit(device = True)
	def transformPoint(worldPos, mvp, screenShape):
	cposX = mvp[0][0] * worldPos.x + mvp[1][0] * worldPos.y + mvp[2][0] * worldPos.z + mvp[3][0]
	cposY = mvp[0][1] * worldPos.x + mvp[1][1] * worldPos.y + mvp[2][1] * worldPos.z + mvp[3][1]
	cposZ = mvp[0][2] * worldPos.x + mvp[1][2] * worldPos.y + mvp[2][2] * worldPos.z + mvp[3][2]
	cposW = mvp[0][3] * worldPos.x + mvp[1][3] * worldPos.y + mvp[2][3] * worldPos.z + mvp[3][3]

	if max(abs(cposX), abs(cposY), abs(cposZ)) >= cposW:
	return -1, -1, -1

	invW = 1.0 / cposW
	sx = (((cposX * invW) + 1.0) * 0.5) * screenShape[1]
	sy = (((cposY * invW) + 1.0) * 0.5) * screenShape[0]
	depth = (((cposZ * invW) + 1.0) * 0.5)

	return np.uint32(math.floor(sx)), np.uint32(math.floor(sy)), np.float32(depth)

	@cuda.jit
	def generateDepthBuffer_kernel(depthBuffer, pointsBuffer, mvp):
	idx = cuda.grid(1)
	if idx >= pointsBuffer.size:
	return

	pos = pointsBuffer[idx].position
	pres = transformPoint(pos, mvp, depthBuffer.shape)
	if pres[2] < 0.0:
	return

	cuda.atomic.min(depthBuffer, (pres[1], pres[0]), pres[2])

	@cuda.jit
	def generateColorBuffer_kernel(depthBuffer, pointsBuffer, colorBuffer, mvp):
	idx = cuda.grid(1)
	if idx >= pointsBuffer.size:
	return

	pos = pointsBuffer[idx].position
	pres = transformPoint(pos, mvp, depthBuffer.shape)
	if pres[2] < 0:
	return
	if pres[2] != depthBuffer[pres[1]][pres[0]]:
	return

	colorBuffer[pres[1]][pres[0]] = pointsBuffer[idx].color

	def renderPointCloud(pointsBuffer, depthBuffer, colorBuffer, mvp):
	numThreads = (16, 16)
	numBlocks = (math.ceil(depthBuffer.shape[1] / 16), math.ceil(depthBuffer.shape[0] / 16))
	clearBuffers_kernel[numBlocks, numThreads](depthBuffer, colorBuffer)
	cuda.synchronize()

	numThreads = 256
	numBlocks = math.ceil(pointsBuffer.size / 256)
	generateDepthBuffer_kernel[numBlocks, numThreads](depthBuffer, pointsBuffer, np.array(glm.transpose(mvp)))
	cuda.synchronize()

	numThreads = 256
	numBlocks = math.ceil(pointsBuffer.size / 256)
	generateColorBuffer_kernel[numBlocks, numThreads](depthBuffer, pointsBuffer, colorBuffer, np.array(glm.transpose(mvp)))
	cuda.synchronize()



	class GuiApp:
	@staticmethod
	def run():
	with contextlib.ExitStack() as exitstack:
	app = GuiApp(exitstack)
	app.loop()

	def __init__(self, exitstack, *, filename = None, numPoints = 10000000):
	self.window = exitstack.enter_context(createGlfwWindow(
	window_name = "Experimental point cloud viewer",
	core = True
	))

	self.windowSize = glfw.get_framebuffer_size(self.window)
	self.projectionMatrix = glm.perspective(math.pi / 3, self.windowSize[0] / self.windowSize[1], 1.0e-2, 100.0)

	imgui.create_context()
	self.imguiImpl = raii(exitstack, GlfwRenderer(self.window))

	self.showImguiDemo = False
	self.pitch = 0.0
	self.yaw = 0.0
	self.eye = glm.vec3(-3.0, 0.0, 0.0)
	self.mouselook = False

	self.pointCloudGl = glGenBuffers(1)
	glBindBuffer(GL_ARRAY_BUFFER, self.pointCloudGl)
	glBufferStorage(GL_ARRAY_BUFFER, numPoints * Point.itemsize, None, 0)
	glBindBuffer(GL_ARRAY_BUFFER, 0)

	self.pointCloudCuda = raii(exitstack, OpenglBufferInCuda(self.pointCloudGl)).asarray(dtype = Point)
	generatePointCloud(self.pointCloudCuda)

	windowShape = (self.windowSize[1], self.windowSize[0])
	self.depthBufferCuda = cuda.device_array(windowShape, dtype = np.float32)
	self.screenBuffer = raii(exitstack, OpenglScreenBuffer(self.windowSize))
	self.screenBufferCuda = raii(exitstack, OpenglBufferInCuda(self.screenBuffer.buffer)).asarray(shape = windowShape, dtype = Vec3b)

	def onKey(self, window, key, scancode, action, mods):
	if action == glfw.PRESS:
	if key == glfw.KEY_ESCAPE:
	if self.mouselook:
	self.mouselook = False
	glfw.set_input_mode(self.window, glfw.RAW_MOUSE_MOTION, glfw.FALSE)
	glfw.set_input_mode(self.window, glfw.CURSOR, glfw.CURSOR_NORMAL)
	else:
	self.mouselook = True
	glfw.set_input_mode(self.window, glfw.CURSOR, glfw.CURSOR_DISABLED)
	glfw.set_input_mode(self.window, glfw.RAW_MOUSE_MOTION, glfw.TRUE)
	glfw.set_cursor_pos(self.window, 0, 0)

	def onMouseMove(self, window, xpos, ypos):
	if self.mouselook:
	self.pitch -= ypos / self.windowSize[1] * 3
	self.yaw -= xpos / self.windowSize[1] * 3
	self.pitch = glm.clamp(self.pitch, -math.pi/2, math.pi/2)
	self.yaw = math.fmod(self.yaw, math.pi*2)
	glfw.set_cursor_pos(self.window, 0, 0)

	def loop(self):
	lastTime = glfw.get_time()
	frameTime = 0.0

	glfw.set_key_callback(self.window, self.onKey)
	glfw.set_cursor_pos_callback(self.window, self.onMouseMove)

	while not glfw.window_should_close(self.window):
	glfw.poll_events()
	self.imguiImpl.process_inputs()

	forward = glm.vec3(
	math.cos(self.pitch) * math.cos(self.yaw),
	math.cos(self.pitch) * math.sin(self.yaw),
	math.sin(self.pitch)
	)
	up = glm.vec3(
	math.cos(self.pitch + math.pi/2) * math.cos(self.yaw),
	math.cos(self.pitch + math.pi/2) * math.sin(self.yaw),
	math.sin(self.pitch + math.pi/2)
	)
	df = 0
	dr = 0
	if glfw.get_key(self.window, glfw.KEY_W) == glfw.PRESS:
	df += 1
	if glfw.get_key(self.window, glfw.KEY_S) == glfw.PRESS:
	df -= 1
	if glfw.get_key(self.window, glfw.KEY_D) == glfw.PRESS:
	dr += 1
	if glfw.get_key(self.window, glfw.KEY_A) == glfw.PRESS:
	dr -= 1
	self.eye += (forward * df + glm.cross(forward, up) * dr) * frameTime * 3.0
	self.viewMatrix = glm.lookAt(self.eye, self.eye + forward * (glm.length(self.eye) + 1), up)

	mvp = self.projectionMatrix * self.viewMatrix

	renderPointCloud(self.pointCloudCuda, self.depthBufferCuda, self.screenBufferCuda, mvp)
	self.screenBuffer.blittoscreen(self.windowSize)

	imgui.new_frame()
	if imgui.begin("Settings", flags = imgui.WINDOW_ALWAYS_AUTO_RESIZE):
	imgui.text("Frame time: %4.1f ms" % (frameTime * 1000.0))

	_, self.showImguiDemo = imgui.checkbox("Show ImGui demo", self.showImguiDemo)
	if self.showImguiDemo:
	imgui.show_test_window()

	imgui.end()
	imgui.render()
	self.imguiImpl.render(imgui.get_draw_data())

	glfw.swap_buffers(self.window)

	newTime = glfw.get_time()
	frameTime = newTime - lastTime
	lastTime = newTime


	def main(argv):
	GuiApp.run()

	if __name__ == "__main__":
	main(sys.argv[1:])