allanmac/assert_cuda.c

## assert_cuda.c
//
//
//

#include <stdlib.h>
#include <stdio.h>

//
//
//

#include "assert_cuda.h"

//
//
//

cudaError_t
cuda_assert(const cudaError_t code, const char* const file, const int line, const bool abort)
{
  if (code != cudaSuccess)
    {
      fprintf(stderr,"cuda_assert: %s %s %d\n",cudaGetErrorString(code),file,line);

      if (abort)
        {
          cudaDeviceReset();
          exit(code);
        }
    }

  return code;
}

//
//
//

## assert_cuda.h
//
//
//

#pragma once

//
//
//

#include <cuda_runtime.h>
#include <stdbool.h>

//
// Beware that NVCC doesn't work with C files and __VA_ARGS__
//

cudaError_t cuda_assert(const cudaError_t code, const char* const file, const int line, const bool abort);

#define cuda(...)  cuda_assert((cuda##__VA_ARGS__), __FILE__, __LINE__, true);

//
//
//

## interop.c

//
//
//

#include <glad/glad.h>
#include <GLFW/glfw3.h>
#include <cuda_gl_interop.h>
#include <stdlib.h>

//
//
//

#include "assert_cuda.h"
#include "interop.h"

//
//
//

struct pxl_interop
{
  // split GPUs?
  bool                    multi_gpu;

  // number of fbo's
  int                     count;
  int                     index;

  // w x h
  int                     width;
  int                     height;

  // GL buffers
  GLuint*                 fb;
  GLuint*                 rb;

  // CUDA resources
  cudaGraphicsResource_t* cgr;
  cudaArray_t*            ca;
};

//
//
//

struct pxl_interop*
pxl_interop_create(const bool multi_gpu, const int fbo_count)
{
  struct pxl_interop* const interop = calloc(1,sizeof(*interop));

  interop->multi_gpu = multi_gpu;
  interop->count     = fbo_count;
  interop->index     = 0;

  // allocate arrays
  interop->fb  = calloc(fbo_count,sizeof(*(interop->fb )));
  interop->rb  = calloc(fbo_count,sizeof(*(interop->rb )));
  interop->cgr = calloc(fbo_count,sizeof(*(interop->cgr)));
  interop->ca  = calloc(fbo_count,sizeof(*(interop->ca)));

  // render buffer object w/a color buffer
  glCreateRenderbuffers(fbo_count,interop->rb);

  // frame buffer object
  glCreateFramebuffers(fbo_count,interop->fb);

  // attach rbo to fbo
  for (int index=0; index<fbo_count; index++)
    {
      glNamedFramebufferRenderbuffer(interop->fb[index],
                                     GL_COLOR_ATTACHMENT0,
                                     GL_RENDERBUFFER,
                                     interop->rb[index]);
    }

  // return it
  return interop;
}


void
pxl_interop_destroy(struct pxl_interop* const interop)
{
  cudaError_t cuda_err;

  // unregister CUDA resources
  for (int index=0; index<interop->count; index++)
    {
      if (interop->cgr[index] != NULL)
        cuda_err = cuda(GraphicsUnregisterResource(interop->cgr[index]));
    }

  // delete rbo's
  glDeleteRenderbuffers(interop->count,interop->rb);

  // delete fbo's
  glDeleteFramebuffers(interop->count,interop->fb);

  // free buffers and resources
  free(interop->fb);
  free(interop->rb);
  free(interop->cgr);
  free(interop->ca);

  // free interop
  free(interop);
}

//
//
//

cudaError_t
pxl_interop_size_set(struct pxl_interop* const interop, const int width, const int height)
{
  cudaError_t cuda_err = cudaSuccess;

  // save new size
  interop->width  = width;
  interop->height = height;

  // resize color buffer
  for (int index=0; index<interop->count; index++)
    {
      // unregister resource
      if (interop->cgr[index] != NULL)
        cuda_err = cuda(GraphicsUnregisterResource(interop->cgr[index]));

      // resize rbo
      glNamedRenderbufferStorage(interop->rb[index],GL_RGBA8,width,height);

      // probe fbo status
      // glCheckNamedFramebufferStatus(interop->fb[index],0);

      // register rbo
      cuda_err = cuda(GraphicsGLRegisterImage(&interop->cgr[index],
					      interop->rb[index],
					      GL_RENDERBUFFER,
					      cudaGraphicsRegisterFlagsSurfaceLoadStore |
					      cudaGraphicsRegisterFlagsWriteDiscard));
    }

  // map graphics resources
  cuda_err = cuda(GraphicsMapResources(interop->count,interop->cgr,0));

  // get CUDA Array refernces
  for (int index=0; index<interop->count; index++)
    {
      cuda_err = cuda(GraphicsSubResourceGetMappedArray(&interop->ca[index],
							interop->cgr[index],
							0,0));
    }

  // unmap graphics resources
  cuda_err = cuda(GraphicsUnmapResources(interop->count,interop->cgr,0));

  return cuda_err;
}

void
pxl_interop_size_get(struct pxl_interop* const interop, int* const width, int* const height)
{
  *width  = interop->width;
  *height = interop->height;
}

//
//
//

cudaError_t
pxl_interop_map(struct pxl_interop* const interop, cudaStream_t stream)
{
  if (!interop->multi_gpu)
    return cudaSuccess;

  // map graphics resources
  return cuda(GraphicsMapResources(1,&interop->cgr[interop->index],stream));
}

cudaError_t
pxl_interop_unmap(struct pxl_interop* const interop, cudaStream_t stream)
{
  if (!interop->multi_gpu)
    return cudaSuccess;

  return cuda(GraphicsUnmapResources(1,&interop->cgr[interop->index],stream));
}

cudaError_t
pxl_interop_array_map(struct pxl_interop* const interop)
{
  //
  // FIXME -- IS THIS EVEN NEEDED?
  //

  cudaError_t cuda_err;

  // get a CUDA Array
  cuda_err = cuda(GraphicsSubResourceGetMappedArray(&interop->ca[interop->index],
						    interop->cgr[interop->index],
						    0,0));
  return cuda_err;
}

//
//
//

cudaArray_const_t
pxl_interop_array_get(struct pxl_interop* const interop)
{
  return interop->ca[interop->index];
}

int
pxl_interop_index_get(struct pxl_interop* const interop)
{
  return interop->index;
}

//
//
//

void
pxl_interop_swap(struct pxl_interop* const interop)
{
  interop->index = (interop->index + 1) % interop->count;
}

//
//
//

void
pxl_interop_clear(struct pxl_interop* const interop)
{
  /*
  static const GLenum attachments[] = { GL_COLOR_ATTACHMENT0 };
  glInvalidateNamedFramebufferData(interop->fb[interop->index],1,attachments);
  */

  const GLfloat clear_color[] = { 1.0f, 1.0f, 1.0f, 1.0f };
  glClearNamedFramebufferfv(interop->fb[interop->index],GL_COLOR,0,clear_color);
}

//
//
//

void
pxl_interop_blit(struct pxl_interop* const interop)
{
  glBlitNamedFramebuffer(interop->fb[interop->index],0,
                         0,0,              interop->width,interop->height,
                         0,interop->height,interop->width,0,
                         GL_COLOR_BUFFER_BIT,
                         GL_NEAREST);
}

//
//
//

## interop.h

//
//
//

#pragma once

//
//
//

#include <cuda_runtime.h>
#include <stdbool.h>

//
//
//

struct pxl_interop*
pxl_interop_create(const bool multi_gpu, const int fbo_count);

void
pxl_interop_destroy(struct pxl_interop* const interop);

//
//
//

cudaError_t
pxl_interop_size_set(struct pxl_interop* const interop, const int width, const int height);

void
pxl_interop_size_get(struct pxl_interop* const interop, int* const width, int* const height);

//
//
//

cudaError_t
pxl_interop_map(struct pxl_interop* const interop, cudaStream_t stream);

cudaError_t
pxl_interop_unmap(struct pxl_interop* const interop, cudaStream_t stream);

cudaError_t
pxl_interop_array_map(struct pxl_interop* const interop);

//
//
//

cudaArray_const_t
pxl_interop_array_get(struct pxl_interop* const interop);

cudaStream_t
pxl_interop_stream_get(struct pxl_interop* const interop);

int
pxl_interop_index_get(struct pxl_interop* const interop);

//
//
//

void
pxl_interop_swap(struct pxl_interop* const interop);

void
pxl_interop_clear(struct pxl_interop* const interop);

void
pxl_interop_blit(struct pxl_interop* const interop);

//
//
//

## kernel.cu
// -*- compile-command: "nvcc arch sm_50 -Xptxas=-v -cubin kernel.cu"; -*-

//
//
//

#ifdef __cplusplus
extern "C" {
#endif

#include "assert_cuda.h"

#ifdef __cplusplus
}
#endif

//
//
//

#define PXL_KERNEL_THREADS_PER_BLOCK  256 // enough for 4Kx2 monitor

//
//
//

surface<void,cudaSurfaceType2D> surf;

//
//
//

union pxl_rgbx_24
{
  uint1       b32;

  struct {
    unsigned  r  : 8;
    unsigned  g  : 8;
    unsigned  b  : 8;
    unsigned  na : 8;
  };
};

//
//
//

extern "C"
__global__
void
pxl_kernel(const int width, const int height)
{
  // pixel coordinates
  const int idx = (blockDim.x * blockIdx.x) + threadIdx.x;
  const int x   = idx % width;
  const int y   = idx / width;

#if 1

  // pixel color
  const int          t    = (unsigned int)clock() / 1100000; // 1.1 GHz
  const int          xt   = (idx + t) % width;
  const unsigned int ramp = (unsigned int)(((float)xt / (float)(width-1)) * 255.0f + 0.5f);
  const unsigned int bar  = ((y + t) / 32) & 3;

  union pxl_rgbx_24  rgbx;

  rgbx.r  = (bar == 0) || (bar == 1) ? ramp : 0;
  rgbx.g  = (bar == 0) || (bar == 2) ? ramp : 0;
  rgbx.b  = (bar == 0) || (bar == 3) ? ramp : 0;
  rgbx.na = 255;

#else // DRAW A RED BORDER TO VALIDATE FLIPPED BLIT

  const bool        border = (x == 0) || (x == width-1) || (y == 0) || (y == height-1);
  union pxl_rgbx_24 rgbx   = { border ? 0xFF0000FF : 0xFF000000 };

#endif

  surf2Dwrite(rgbx.b32, // even simpler: (unsigned int)clock()
    surf,
    x*sizeof(rgbx),
    y,
    cudaBoundaryModeZero); // squelches out-of-bound writes
}

//
//
//

extern "C"
cudaError_t
pxl_kernel_launcher(cudaArray_const_t array,
                    const int         width,
                    const int         height,
                    cudaEvent_t       event,
                    cudaStream_t      stream)
{
  cudaError_t cuda_err;

  // cuda_err = cudaEventRecord(event,stream);

  cuda_err = cuda(BindSurfaceToArray(surf,array));

  if (cuda_err)
    return cuda_err;

  const int blocks = (width * height + PXL_KERNEL_THREADS_PER_BLOCK - 1) / PXL_KERNEL_THREADS_PER_BLOCK;

  // cuda_err = cudaEventRecord(event,stream);

  if (blocks > 0)
    pxl_kernel<<<blocks,PXL_KERNEL_THREADS_PER_BLOCK,0,stream>>>(width,height);

  // cuda_err = cudaStreamWaitEvent(stream,event,0);

  return cudaSuccess;
}

//
//
//

## main.c
//
//
//

#include <glad/glad.h>
#include <GLFW/glfw3.h>

//
//
//

#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>

//
//
//

#include <cuda_gl_interop.h>

//
//
//

#include "assert_cuda.h"
#include "interop.h"

//
// FPS COUNTER FROM HERE:
//
// http://antongerdelan.net/opengl/glcontext2.html
//

static
void
pxl_glfw_fps(GLFWwindow* window)
{
  // static fps counters
  static double stamp_prev  = 0.0;
  static int    frame_count = 0;

  // locals
  const double stamp_curr = glfwGetTime();
  const double elapsed    = stamp_curr - stamp_prev;

  if (elapsed > 0.5)
    {
      stamp_prev = stamp_curr;

      const double fps = (double)frame_count / elapsed;

      int  width, height;
      char tmp[64];

      glfwGetFramebufferSize(window,&width,&height);

      sprintf_s(tmp,64,"(%u x %u) - FPS: %.2f",width,height,fps);

      glfwSetWindowTitle(window,tmp);

      frame_count = 0;
    }

  frame_count++;
}

//
//
//

static
void
pxl_glfw_error_callback(int error, const char* description)
{
  fputs(description,stderr);
}

static
void
pxl_glfw_key_callback(GLFWwindow* window, int key, int scancode, int action, int mods)
{
  if (key == GLFW_KEY_ESCAPE && action == GLFW_PRESS)
    glfwSetWindowShouldClose(window, GL_TRUE);
}

static
void
pxl_glfw_init(GLFWwindow** window, const int width, const int height)
{
  //
  // INITIALIZE GLFW/GLAD
  //

  glfwSetErrorCallback(pxl_glfw_error_callback);

  if (!glfwInit())
    exit(EXIT_FAILURE);

  glfwWindowHint(GLFW_DEPTH_BITS,            0);
  glfwWindowHint(GLFW_STENCIL_BITS,          0);

  glfwWindowHint(GLFW_SRGB_CAPABLE,          GL_TRUE);

  glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 4);
  glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 5);

  glfwWindowHint(GLFW_OPENGL_PROFILE,        GLFW_OPENGL_CORE_PROFILE);

#ifdef PXL_FULLSCREEN
  GLFWmonitor*       monitor = glfwGetPrimaryMonitor();
  const GLFWvidmode* mode    = glfwGetVideoMode(monitor);
  *window                    = glfwCreateWindow(mode->width,mode->height,"GLFW / CUDA Interop",monitor,NULL);
#else
  *window = glfwCreateWindow(width,height,"GLFW / CUDA Interop",NULL,NULL);
#endif

  if (*window == NULL)
    {
      glfwTerminate();
      exit(EXIT_FAILURE);
    }

  glfwMakeContextCurrent(*window);

  // set up GLAD
  gladLoadGLLoader((GLADloadproc)glfwGetProcAddress);

  // ignore vsync for now
  glfwSwapInterval(0);

  // only copy r/g/b
  glColorMask(GL_TRUE,GL_TRUE,GL_TRUE,GL_FALSE);

  // enable SRGB
  // glEnable(GL_FRAMEBUFFER_SRGB);
}

//
//
//

static
void
pxl_glfw_window_size_callback(GLFWwindow* window, int width, int height)
{
  // get context
  struct pxl_interop* const interop = glfwGetWindowUserPointer(window);

  pxl_interop_size_set(interop,width,height);
}

//
//
//

cudaError_t
pxl_kernel_launcher(cudaArray_const_t array,
                    const int         width,
                    const int         height,
                    cudaEvent_t       event,
                    cudaStream_t      stream);

//
//
//

int
main(int argc, char* argv[])
{
  //
  // INIT GLFW
  //
  GLFWwindow* window;

  pxl_glfw_init(&window,1024,1024);

  //
  // INIT CUDA
  //
  cudaError_t cuda_err;

  int gl_device_id,gl_device_count;
  cuda_err = cuda(GLGetDevices(&gl_device_count,&gl_device_id,1,cudaGLDeviceListAll));

  int cuda_device_id = (argc > 1) ? atoi(argv[1]) : gl_device_id;
  cuda_err = cuda(SetDevice(cuda_device_id));

  //
  // MULTI-GPU?
  //
  const bool multi_gpu = gl_device_id != cuda_device_id;

  //
  // INFO
  //
  struct cudaDeviceProp props;

  cuda_err = cuda(GetDeviceProperties(&props,gl_device_id));
  printf("GL   : %-24s (%2d)\n",props.name,props.multiProcessorCount);

  cuda_err = cuda(GetDeviceProperties(&props,cuda_device_id));
  printf("CUDA : %-24s (%2d)\n",props.name,props.multiProcessorCount);

  //
  // CREATE CUDA STREAM & EVENT
  //
  cudaStream_t stream;
  cudaEvent_t  event;

  cuda_err = cuda(StreamCreateWithFlags(&stream,cudaStreamDefault));   // optionally ignore default stream behavior
  cuda_err = cuda(EventCreateWithFlags(&event,cudaEventBlockingSync)); // | cudaEventDisableTiming);

  //
  // CREATE INTEROP
  //
  // TESTING -- DO NOT SET TO FALSE, ONLY TRUE IS RELIABLE
  struct pxl_interop* const interop = pxl_interop_create(true /*multi_gpu*/,2);

  //
  // RESIZE INTEROP
  //

  int width, height;

  // get initial width/height
  glfwGetFramebufferSize(window,&width,&height);

  // resize with initial window dimensions
  cuda_err = pxl_interop_size_set(interop,width,height);

  //
  // SET USER POINTER AND CALLBACKS
  //
  glfwSetWindowUserPointer      (window,interop);
  glfwSetKeyCallback            (window,pxl_glfw_key_callback);
  glfwSetFramebufferSizeCallback(window,pxl_glfw_window_size_callback);

  //
  // LOOP UNTIL DONE
  //
  while (!glfwWindowShouldClose(window))
    {
      //
      // MONITOR FPS
      //
      pxl_glfw_fps(window);

      //
      // EXECUTE CUDA KERNEL ON RENDER BUFFER
      //
      int         width,height;
      cudaArray_t cuda_array;

      pxl_interop_size_get(interop,&width,&height);

      cuda_err = pxl_interop_map(interop,stream);

      cuda_err = pxl_kernel_launcher(pxl_interop_array_get(interop),
                                     width,
                                     height,
                                     event,
                                     stream);

      cuda_err = pxl_interop_unmap(interop,stream);

      //
      // BLIT & SWAP FBO
      //
      pxl_interop_blit(interop);
      // pxl_interop_clear(interop);
      pxl_interop_swap(interop);

      //
      // SWAP WINDOW
      //
      glfwSwapBuffers(window);

      //
      // PUMP/POLL/WAIT
      //
      glfwPollEvents(); // glfwWaitEvents();
    }

  //
  // CLEANUP
  //
  pxl_interop_destroy(interop);

  glfwDestroyWindow(window);

  glfwTerminate();

  cuda(DeviceReset());

  // missing some clean up here

  exit(EXIT_SUCCESS);
}

//
//
//
	//
	//
	//

	#include <stdlib.h>
	#include <stdio.h>

	//
	//
	//

	#include "assert_cuda.h"

	//
	//
	//

	cudaError_t
	cuda_assert(const cudaError_t code, const char* const file, const int line, const bool abort)
	{
	if (code != cudaSuccess)
	{
	fprintf(stderr,"cuda_assert: %s %s %d\n",cudaGetErrorString(code),file,line);

	if (abort)
	{
	cudaDeviceReset();
	exit(code);
	}
	}

	return code;
	}

	//
	//
	//
	//
	//
	//

	#pragma once

	//
	//
	//

	#include <cuda_runtime.h>
	#include <stdbool.h>

	//
	// Beware that NVCC doesn't work with C files and __VA_ARGS__
	//

	cudaError_t cuda_assert(const cudaError_t code, const char* const file, const int line, const bool abort);

	#define cuda(...) cuda_assert((cuda##__VA_ARGS__), __FILE__, __LINE__, true);

	//
	//
	//

	//
	//
	//

	#include <glad/glad.h>
	#include <GLFW/glfw3.h>
	#include <cuda_gl_interop.h>
	#include <stdlib.h>

	//
	//
	//

	#include "assert_cuda.h"
	#include "interop.h"

	//
	//
	//

	struct pxl_interop
	{
	// split GPUs?
	bool multi_gpu;

	// number of fbo's
	int count;
	int index;

	// w x h
	int width;
	int height;

	// GL buffers
	GLuint* fb;
	GLuint* rb;

	// CUDA resources
	cudaGraphicsResource_t* cgr;
	cudaArray_t* ca;
	};

	//
	//
	//

	struct pxl_interop*
	pxl_interop_create(const bool multi_gpu, const int fbo_count)
	{
	struct pxl_interop* const interop = calloc(1,sizeof(*interop));

	interop->multi_gpu = multi_gpu;
	interop->count = fbo_count;
	interop->index = 0;

	// allocate arrays
	interop->fb = calloc(fbo_count,sizeof(*(interop->fb )));
	interop->rb = calloc(fbo_count,sizeof(*(interop->rb )));
	interop->cgr = calloc(fbo_count,sizeof(*(interop->cgr)));
	interop->ca = calloc(fbo_count,sizeof(*(interop->ca)));

	// render buffer object w/a color buffer
	glCreateRenderbuffers(fbo_count,interop->rb);

	// frame buffer object
	glCreateFramebuffers(fbo_count,interop->fb);

	// attach rbo to fbo
	for (int index=0; index<fbo_count; index++)
	{
	glNamedFramebufferRenderbuffer(interop->fb[index],
	GL_COLOR_ATTACHMENT0,
	GL_RENDERBUFFER,
	interop->rb[index]);
	}

	// return it
	return interop;
	}


	void
	pxl_interop_destroy(struct pxl_interop* const interop)
	{
	cudaError_t cuda_err;

	// unregister CUDA resources
	for (int index=0; index<interop->count; index++)
	{
	if (interop->cgr[index] != NULL)
	cuda_err = cuda(GraphicsUnregisterResource(interop->cgr[index]));
	}

	// delete rbo's
	glDeleteRenderbuffers(interop->count,interop->rb);

	// delete fbo's
	glDeleteFramebuffers(interop->count,interop->fb);

	// free buffers and resources
	free(interop->fb);
	free(interop->rb);
	free(interop->cgr);
	free(interop->ca);

	// free interop
	free(interop);
	}

	//
	//
	//

	cudaError_t
	pxl_interop_size_set(struct pxl_interop* const interop, const int width, const int height)
	{
	cudaError_t cuda_err = cudaSuccess;

	// save new size
	interop->width = width;
	interop->height = height;

	// resize color buffer
	for (int index=0; index<interop->count; index++)
	{
	// unregister resource
	if (interop->cgr[index] != NULL)
	cuda_err = cuda(GraphicsUnregisterResource(interop->cgr[index]));

	// resize rbo
	glNamedRenderbufferStorage(interop->rb[index],GL_RGBA8,width,height);

	// probe fbo status
	// glCheckNamedFramebufferStatus(interop->fb[index],0);

	// register rbo
	cuda_err = cuda(GraphicsGLRegisterImage(&interop->cgr[index],
	interop->rb[index],
	GL_RENDERBUFFER,
	cudaGraphicsRegisterFlagsSurfaceLoadStore \|
	cudaGraphicsRegisterFlagsWriteDiscard));
	}

	// map graphics resources
	cuda_err = cuda(GraphicsMapResources(interop->count,interop->cgr,0));

	// get CUDA Array refernces
	for (int index=0; index<interop->count; index++)
	{
	cuda_err = cuda(GraphicsSubResourceGetMappedArray(&interop->ca[index],
	interop->cgr[index],
	0,0));
	}

	// unmap graphics resources
	cuda_err = cuda(GraphicsUnmapResources(interop->count,interop->cgr,0));

	return cuda_err;
	}

	void
	pxl_interop_size_get(struct pxl_interop* const interop, int* const width, int* const height)
	{
	*width = interop->width;
	*height = interop->height;
	}

	//
	//
	//

	cudaError_t
	pxl_interop_map(struct pxl_interop* const interop, cudaStream_t stream)
	{
	if (!interop->multi_gpu)
	return cudaSuccess;

	// map graphics resources
	return cuda(GraphicsMapResources(1,&interop->cgr[interop->index],stream));
	}

	cudaError_t
	pxl_interop_unmap(struct pxl_interop* const interop, cudaStream_t stream)
	{
	if (!interop->multi_gpu)
	return cudaSuccess;

	return cuda(GraphicsUnmapResources(1,&interop->cgr[interop->index],stream));
	}

	cudaError_t
	pxl_interop_array_map(struct pxl_interop* const interop)
	{
	//
	// FIXME -- IS THIS EVEN NEEDED?
	//

	cudaError_t cuda_err;

	// get a CUDA Array
	cuda_err = cuda(GraphicsSubResourceGetMappedArray(&interop->ca[interop->index],
	interop->cgr[interop->index],
	0,0));
	return cuda_err;
	}

	//
	//
	//

	cudaArray_const_t
	pxl_interop_array_get(struct pxl_interop* const interop)
	{
	return interop->ca[interop->index];
	}

	int
	pxl_interop_index_get(struct pxl_interop* const interop)
	{
	return interop->index;
	}

	//
	//
	//

	void
	pxl_interop_swap(struct pxl_interop* const interop)
	{
	interop->index = (interop->index + 1) % interop->count;
	}

	//
	//
	//

	void
	pxl_interop_clear(struct pxl_interop* const interop)
	{
	/*
	static const GLenum attachments[] = { GL_COLOR_ATTACHMENT0 };
	glInvalidateNamedFramebufferData(interop->fb[interop->index],1,attachments);
	*/

	const GLfloat clear_color[] = { 1.0f, 1.0f, 1.0f, 1.0f };
	glClearNamedFramebufferfv(interop->fb[interop->index],GL_COLOR,0,clear_color);
	}

	//
	//
	//

	void
	pxl_interop_blit(struct pxl_interop* const interop)
	{
	glBlitNamedFramebuffer(interop->fb[interop->index],0,
	0,0, interop->width,interop->height,
	0,interop->height,interop->width,0,
	GL_COLOR_BUFFER_BIT,
	GL_NEAREST);
	}

	//
	//
	//
	// -- compile-command: "nvcc arch sm_50 -Xptxas=-v -cubin kernel.cu"; --

	//
	//
	//

	#ifdef __cplusplus
	extern "C" {
	#endif

	#include "assert_cuda.h"

	#ifdef __cplusplus
	}
	#endif

	//
	//
	//

	#define PXL_KERNEL_THREADS_PER_BLOCK 256 // enough for 4Kx2 monitor

	//
	//
	//

	surface<void,cudaSurfaceType2D> surf;

	//
	//
	//

	union pxl_rgbx_24
	{
	uint1 b32;

	struct {
	unsigned r : 8;
	unsigned g : 8;
	unsigned b : 8;
	unsigned na : 8;
	};
	};

	//
	//
	//

	extern "C"
	__global__
	void
	pxl_kernel(const int width, const int height)
	{
	// pixel coordinates
	const int idx = (blockDim.x * blockIdx.x) + threadIdx.x;
	const int x = idx % width;
	const int y = idx / width;

	#if 1

	// pixel color
	const int t = (unsigned int)clock() / 1100000; // 1.1 GHz
	const int xt = (idx + t) % width;
	const unsigned int ramp = (unsigned int)(((float)xt / (float)(width-1)) * 255.0f + 0.5f);
	const unsigned int bar = ((y + t) / 32) & 3;

	union pxl_rgbx_24 rgbx;

	rgbx.r = (bar == 0) \|\| (bar == 1) ? ramp : 0;
	rgbx.g = (bar == 0) \|\| (bar == 2) ? ramp : 0;
	rgbx.b = (bar == 0) \|\| (bar == 3) ? ramp : 0;
	rgbx.na = 255;

	#else // DRAW A RED BORDER TO VALIDATE FLIPPED BLIT

	const bool border = (x == 0) \|\| (x == width-1) \|\| (y == 0) \|\| (y == height-1);
	union pxl_rgbx_24 rgbx = { border ? 0xFF0000FF : 0xFF000000 };

	#endif

	surf2Dwrite(rgbx.b32, // even simpler: (unsigned int)clock()
	surf,
	x*sizeof(rgbx),
	y,
	cudaBoundaryModeZero); // squelches out-of-bound writes
	}

	//
	//
	//

	extern "C"
	cudaError_t
	pxl_kernel_launcher(cudaArray_const_t array,
	const int width,
	const int height,
	cudaEvent_t event,
	cudaStream_t stream)
	{
	cudaError_t cuda_err;

	// cuda_err = cudaEventRecord(event,stream);

	cuda_err = cuda(BindSurfaceToArray(surf,array));

	if (cuda_err)
	return cuda_err;

	const int blocks = (width * height + PXL_KERNEL_THREADS_PER_BLOCK - 1) / PXL_KERNEL_THREADS_PER_BLOCK;

	// cuda_err = cudaEventRecord(event,stream);

	if (blocks > 0)
	pxl_kernel<<<blocks,PXL_KERNEL_THREADS_PER_BLOCK,0,stream>>>(width,height);

	// cuda_err = cudaStreamWaitEvent(stream,event,0);

	return cudaSuccess;
	}

	//
	//
	//