Skip to content

Instantly share code, notes, and snippets.

@danieloneill
Created February 28, 2023 04:15
Show Gist options
  • Save danieloneill/9845077a1fe498e2272f6248eceac88f to your computer and use it in GitHub Desktop.
Save danieloneill/9845077a1fe498e2272f6248eceac88f to your computer and use it in GitHub Desktop.
(C) Benchmark for vertically flipping a 320x240x24bpp image using different CPU and GPU methods, Linux/SDL/CUDA
#define MODE_GPU 0
#define MODE_GPU_MEMCPY 1
#define MODE_CPU 2
#define MODE_CPU_MEMCPY 3
all:
/usr/local/cuda-12.0/bin/nvcc -o readbmp readbmp.cu sdltest.cu -g `sdl2-config --cflags --libs` -lcuda
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <unistd.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include "bleh.h"
typedef struct {
int w;
int h;
uint8_t *data;
size_t length;
} BMP;
int readbmp(const char *path, BMP *bmp)
{
memset(bmp, 0, sizeof(BMP));
int fd = open(path, 0);
if( -1 == fd )
return -1;
struct {
char hf[2];
uint8_t idgaf[8];
uint32_t offset __attribute__((packed));
} bmpheader;
struct {
uint32_t hlen;
int32_t width;
int32_t height;
uint16_t planes;
uint16_t bpp;
uint32_t comp;
uint32_t imgsize;
int32_t hres;
int32_t yres;
uint32_t palcolcount;
uint32_t impcols;
} dibheader;
if( 14 > read(fd, &bmpheader, 14 ) )
return -3;
if( 40 > read(fd, &dibheader, 40 ) )
return -4;
printf("%dx%d @ %dbpp (%d) @%x\n", dibheader.width, dibheader.height, dibheader.bpp, dibheader.comp, bmpheader.offset);
if( dibheader.comp != 0 || dibheader.bpp != 24 )
{
fprintf(stderr, "I only read Uncompressed 24bpp BMPs, no alpha, huffman, or RLE.\n");
close(fd);
return -2;
}
size_t dlen = dibheader.width * dibheader.height * 3;
uint8_t *data = (uint8_t *)malloc(dlen);
read(fd, data, dlen);
bmp->w = dibheader.width;
bmp->h = dibheader.height;
bmp->data = data;
bmp->length = dlen;
close(fd);
return 0;
}
bool setupSDL(int w, int h);
bool renderSDL();
void shutdownSDL();
void blitData(const uint8_t *data, int useMode);
int usage(char *pname)
{
fprintf(stderr, "Usage: %s <-c|-C|-g|-G>\n\t-c - CPU, interpolation method\n\t-C - CPU, memcpy method\n\t-g - CUDA, interpolation method\n\t-G - CUDA, memcpy method\n\n", pname);
return -1;
}
int main(int argc, char **argv)
{
int useMode = MODE_CPU;
if( argc > 1 )
{
if( strcmp(argv[1], "-c") == 0 )
useMode = MODE_CPU;
else if( strcmp(argv[1], "-C") == 0 )
useMode = MODE_CPU_MEMCPY;
else if( strcmp(argv[1], "-g") == 0 )
useMode = MODE_GPU;
else if( strcmp(argv[1], "-G") == 0 )
useMode = MODE_GPU_MEMCPY;
else
return usage(argv[0]);
} else
return usage(argv[0]);
BMP bmp;
int rval = readbmp("test.bmp", &bmp);
if( rval != 0 )
return rval;
if( !setupSDL(bmp.w, bmp.h) )
return -4;
while( true == renderSDL() ) {
blitData(bmp.data, useMode);
}
if( 0 != bmp.data )
free(bmp.data);
return 0;
}
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include <sys/time.h>
#include <SDL.h>
#include <cuda_runtime.h>
#include "bleh.h"
bool Running = true;
SDL_Window* Window = NULL;
SDL_Surface* PrimarySurface = NULL;
SDL_Surface *imageSurface;
uint8_t *cudaSrc = NULL;
uint8_t *cudaDst = NULL;
void timespec_diff(struct timespec *start, struct timespec *stop,
struct timespec *result)
{
if ((stop->tv_nsec - start->tv_nsec) < 0) {
result->tv_sec = stop->tv_sec - start->tv_sec - 1;
result->tv_nsec = stop->tv_nsec - start->tv_nsec + 1000000000;
} else {
result->tv_sec = stop->tv_sec - start->tv_sec;
result->tv_nsec = stop->tv_nsec - start->tv_nsec;
}
return;
}
#define COPYCLASS float
__device__ void devCpyCplx(const COPYCLASS *in, COPYCLASS *out, int len)
{
for (int i=0; i < len/sizeof(COPYCLASS); ++i)
out[i] = in[i];
}
__global__ void flipImage(uint8_t *src, uint8_t *dest, size_t stride, int rowCount, bool mc)
{
// Row # is threadIdx...
if( threadIdx.x > rowCount )
return;
int srcRow = threadIdx.x;
int dstRow = rowCount - threadIdx.x;
uint8_t *sPtr = src + (stride * srcRow);
uint8_t *dPtr = dest + (stride * dstRow);
if( false == mc )
devCpyCplx((COPYCLASS*)sPtr, (COPYCLASS*)dPtr, stride);
else
memcpy(dPtr, sPtr, stride);
}
bool setupSDL(int w, int h)
{
if(SDL_Init(SDL_INIT_VIDEO) < 0) {
fprintf(stderr, "Unable to Init SDL: %s\n", SDL_GetError());
return false;
}
if(!SDL_SetHint(SDL_HINT_RENDER_SCALE_QUALITY, "1")) {
fprintf(stderr, "Unable to Init hinting: %s\n", SDL_GetError());
}
if((Window = SDL_CreateWindow(
"SDL",
SDL_WINDOWPOS_UNDEFINED, SDL_WINDOWPOS_UNDEFINED,
w, h, SDL_WINDOW_SHOWN)
) == NULL) {
fprintf(stderr, "Unable to create SDL Window: %s\n", SDL_GetError());
return false;
}
PrimarySurface = SDL_GetWindowSurface( Window );
if( !PrimarySurface )
{
fprintf(stderr, "Failed to get window surface.\n");
return false;
}
Running = true;
return true;
}
bool setupCuda(int w, int h)
{
if( !cudaSrc )
cudaMalloc(&cudaSrc, w * h * 3);
if( !cudaDst )
cudaMalloc(&cudaDst, w * h * 3);
return cudaSrc && cudaDst;
}
void freeCuda()
{
if( cudaSrc )
cudaFree(cudaSrc);
if( cudaDst )
cudaFree(cudaDst);
cudaSrc = NULL;
cudaDst = NULL;
}
void blitDataCuda(const uint8_t *data, bool mc)
{
size_t imgSize = imageSurface->w * imageSurface->h * 3;
bool rval = setupCuda(imageSurface->w, imageSurface->h);
cudaMemcpy(cudaSrc, data, imgSize, cudaMemcpyHostToDevice);
flipImage <<< 1, imageSurface->h >>> (cudaSrc, cudaDst, imageSurface->w * 3, imageSurface->h, mc);
cudaDeviceSynchronize();
SDL_LockSurface(imageSurface);
cudaMemcpy(imageSurface->pixels, cudaDst, imgSize, cudaMemcpyDeviceToHost);
SDL_UnlockSurface(imageSurface);
}
void blitDataCPU(const COPYCLASS *data)
{
COPYCLASS *dst = (COPYCLASS *)imageSurface->pixels;
int stride = imageSurface->w * 3 / sizeof(COPYCLASS);
SDL_LockSurface(imageSurface);
for( int y=0; y < imageSurface->h; y++ )
{
int invy = imageSurface->h - y - 1;
for( int x=0; x < stride; x++ )
{
uint32_t dpos = invy * stride + x;
uint32_t spos = y * stride + x;
dst[dpos] = data[spos];
}
}
SDL_UnlockSurface(imageSurface);
}
void blitDataCPUMemcpy(const uint8_t *data)
{
uint8_t *dst = (uint8_t *)imageSurface->pixels;
int stride = imageSurface->w * 3;
SDL_LockSurface(imageSurface);
for( int y=0; y < imageSurface->h; y++ )
{
int invy = imageSurface->h - y - 1;
memcpy( dst + (stride * invy), data + (stride * y), stride );
}
SDL_UnlockSurface(imageSurface);
}
void blitData(const uint8_t *data, int useMode)
{
if( !imageSurface )
{
int w, h;
SDL_GetWindowSize(Window, &w, &h);
imageSurface = SDL_CreateRGBSurface(0, w, h, 24, 0xFF0000, 0x00FF00, 0x0000FF, 0x000000);
}
if( !imageSurface )
return;
struct timespec beforeBlit, afterBlit, blitDiff;
clock_gettime(CLOCK_REALTIME, &beforeBlit);
bool rval = (useMode == MODE_GPU || useMode == MODE_GPU_MEMCPY) ? setupCuda(imageSurface->w, imageSurface->h) : false;
if( rval == true && useMode == MODE_GPU )
blitDataCuda(data, false);
else if( rval == true && useMode == MODE_GPU_MEMCPY )
blitDataCuda(data, true);
else if( useMode == MODE_CPU_MEMCPY )
blitDataCPUMemcpy(data);
else if( useMode == MODE_CPU )
blitDataCPU((COPYCLASS *)data);
else
printf("No copy method available.\n");
clock_gettime(CLOCK_REALTIME, &afterBlit);
timespec_diff(&beforeBlit, &afterBlit, &blitDiff);
const char *target = (useMode == MODE_CPU || useMode == MODE_CPU_MEMCPY) ? "CPU" : "GPU";
const char *u_memcpy = (useMode == MODE_CPU_MEMCPY || useMode == MODE_GPU_MEMCPY) ? "Memcpy" : "Interp";
printf("Flip (%s %s): %ld.%09lds\n", target, u_memcpy, blitDiff.tv_sec, blitDiff.tv_nsec);
if( PrimarySurface )
SDL_BlitSurface(imageSurface, NULL, PrimarySurface, NULL);
SDL_UpdateWindowSurface(Window);
}
void shutdownSDL()
{
if(Window) {
SDL_DestroyWindow(Window);
Window = NULL;
}
if( imageSurface )
{
SDL_FreeSurface(imageSurface);
imageSurface = NULL;
}
SDL_Quit();
}
bool renderSDL()
{
SDL_Event Event;
while(SDL_PollEvent(&Event) != 0) {
if(Event.type == SDL_QUIT)
{
fprintf(stderr, "Quit\n");
Running = false;
}
}
if( Running )
SDL_Delay(1);
else {
shutdownSDL();
freeCuda();
}
return Running;
}
uint16_t ConvertRGB888toRGB565(uint32_t sourceColor)
{
unsigned int red = (sourceColor & 0x00FF0000) >> 16;
unsigned int green = (sourceColor & 0x0000FF00) >> 8;
unsigned int blue = sourceColor & 0x000000FF;
return (red >> 3 << 11) + (green >> 2 << 5) + (blue >> 3);
}
#if 0
int main()
{
uint32_t argb32 = 0xFFB57FE3;
uint8_t r8 = (argb32 & 0x00FF0000) >> 16; // 0xB5;
uint8_t g8 = (argb32 & 0x0000FF00) >> 8; // 0x7F;
uint8_t b8 = (argb32 & 0x000000FF); // 0xE3;
uint16_t rgb16;
uint8_t r5, g6, b5;
r5 = r8 >> 3;
g6 = g8 >> 2;
b5 = b8 >> 3;
rgb16 = (r5 << 11);
rgb16 += (g6 << 5);
rgb16 += b5;
printf("RGB888 0x%x%x%x/%u,%u,%u => RGB16 %d/%d/%d => %u/%04x\n", r8, g8, b8, r8, g8, b8, r5, g6, b5, rgb16, rgb16);
printf("RGB888 0x%x%x%x => RGB16 %u\n", r8, g8, b8, ConvertRGB888toRGB565( 0xB57FE3 ) );
}
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment