hamsham/cuda_raymarcher.cu

## cuda_raymarcher.cu

/*
 * A small raymarcher using CUDA
 *
 * This program renders a dynamically-generated image to a PPM file.
 *
 * nvcc cuda_raymarcher.cu -arch=sm_20 -o cuda_raymarcher
 */

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>

#include <cuda_runtime.h>

#include "cuda_math.h"


// Base tolerance shared by the ray marcher and the normal estimator. This is
// a double literal; every use in this file first stores it in a float local.
#define EPSILON 1.0e-5

// Upper bound on the number of iterations raymarch() performs per ray.
#define MAX_STEPS 128
// Travel distance past which raymarch() gives up. It is also the value that
// raymarch() returns, and that render_sphere() tests against, for a miss.
#define MAX_DIST 100.f
// Scene distance below which raymarch() reports a hit.
#define MIN_DIST EPSILON

// Multiplier applied to the dot product in get_diffuse_intensity().
#define LIGHT_INTENSITY 1.f

// NOTE(review): not referenced by any code visible in this file.
#define CAM_NEAR 0.1f


/*
 * Flattens the 2D block and thread coordinates of the calling thread into a
 * single index. Threads of one block are numbered consecutively (row by row
 * inside the block) and blocks are numbered row by row inside the grid.
 * Only the x and y launch dimensions are taken into account.
 */
__device__ unsigned calc_pixel_index()
{
    const unsigned threadsPerBlock = blockDim.x * blockDim.y;
    const unsigned linearBlock     = blockIdx.y * gridDim.x + blockIdx.x;
    const unsigned linearThread    = threadIdx.y * blockDim.x + threadIdx.x;

    return linearBlock * threadsPerBlock + linearThread;
}


/*
 * Converts a flat pixel index into normalized image coordinates.
 *
 * The index is decoded x-major: x = arrayId / h and y = arrayId % h, i.e.
 * the h pixels that share one x value are adjacent in the array.
 *
 * w, h     image dimensions in pixels
 * arrayId  flat pixel index
 *
 * Returns {2*x/w - 1, 2*y/h - 1}. For an index below w*h both components
 * lie in [-1, 1).
 */
__device__ float2 calc_pixel_pos(const unsigned w, const unsigned h, const unsigned arrayId)
{
    const unsigned x = arrayId / h;
    // The remainder has to be taken against the same stride as the division
    // (h). Subtracting x * w is only correct for square images and wraps
    // around (unsigned arithmetic) as soon as w > h.
    const unsigned y = arrayId - (x * h);

    float2 ret;
    ret.x = 2.f * ((float)x / (float)w) - 1.f;
    ret.y = 2.f * ((float)y / (float)h) - 1.f;

    return ret;
}


/*
 * Distance estimate from `pos` to the scene, which is a single torus.
 *
 * Assuming fill2_vec2f() builds a float2 from its two arguments and
 * length_vec2f() returns its Euclidean length (both come from cuda_math.h),
 * the torus is centred on the origin with its ring in the XZ plane, a ring
 * radius of 0.35 and a tube radius of 0.1.
 *
 * Alternative scene kept for reference (sphere of radius 0.5):
 *     return length_vec3f(pos) - 0.5f;
 */
__device__ float calc_scene_distance(const float3 pos)
{
    const float ringRadius = 0.35f;
    const float tubeRadius = 0.1f;

    // Distance from the Y axis, then the offset from the ring's centre line.
    const float2 planar  = fill2_vec2f(pos.x, pos.z);
    const float2 section = fill2_vec2f(length_vec2f(planar) - ringRadius, pos.y);

    return length_vec2f(section) - tubeRadius;
}


/*
 * Turns the dot product of the two given vectors into a brightness value.
 *
 * The dot product is scaled by LIGHT_INTENSITY, then halved and offset by
 * 0.5, which maps an input range of [-1, 1] onto [0, 1] so the result can
 * be used as a colour channel.
 */
__device__ float get_diffuse_intensity(const float3 vertNorm, const float3 lightDir)
{
    const float scaled = dot_vec3f(vertNorm, lightDir) * LIGHT_INTENSITY;

    return 0.5f * scaled + 0.5f;
}


/*
 * Estimates the surface orientation at `point` with one-sided differences
 * of the scene distance function.
 *
 * Each component is calc_scene_distance(point - step * axis) minus
 * calc_scene_distance(point). Because the sample is taken at "point minus
 * step", the vector before normalization approximates the gradient scaled
 * by -step. The result is passed through normalize_vec3f().
 */
__device__ float3 calc_scene_normal(const float3 point)
{
    // Held in a float so every subtraction below is single precision.
    const float step   = EPSILON;
    const float centre = calc_scene_distance(point);

    float3 shifted;
    float3 gradient;

    shifted = point;
    shifted.x -= step;
    gradient.x = calc_scene_distance(shifted) - centre;

    shifted = point;
    shifted.y -= step;
    gradient.y = calc_scene_distance(shifted) - centre;

    shifted = point;
    shifted.z -= step;
    gradient.z = calc_scene_distance(shifted) - centre;

    return normalize_vec3f(gradient);
}


/*
 * Computes the grey-scale colour of the surface point reached after
 * travelling `dist` along `ray` from `eye`.
 *
 * The light sits at a fixed position. The direction handed to
 * get_diffuse_intensity() is (hit point - light position), i.e. it points
 * from the light towards the surface. The returned colour has the same
 * brightness in all three channels.
 */
__device__ float3 shade_scene(const float3 eye, const float3 ray, const float dist)
{
    const float3 lightPos = {3.f, 3.f, 3.f}; // TODO: kill the magic number

    const float3 travel = mul_vec3f(ray, fill_vec3f(dist));
    const float3 hit    = add_vec3f(eye, travel);
    const float3 toHit  = sub_vec3f(hit, lightPos);

    const float shade = get_diffuse_intensity(calc_scene_normal(hit), normalize_vec3f(toHit));

    return fill_vec3f(shade);
}


// Marches a ray through the scene: starting at `origin`, repeatedly advances
// along `direction` by the distance calc_scene_distance() reports for the
// current position.
//
// Returns the distance travelled when the scene distance drops below
// MIN_DIST (a hit). Returns MAX_DIST when the travelled distance exceeds
// MAX_DIST or when MAX_STEPS iterations pass without a hit.
__device__ float raymarch(const float3 origin, const float3 direction)
{
    // Stored as floats so the comparisons below are done in single precision.
    const float hitThreshold = MIN_DIST;
    const float farLimit     = MAX_DIST;

    float travelled = 0.f;

    for (unsigned remaining = MAX_STEPS; remaining != 0; --remaining)
    {
        const float3 offset = mul_vec3f(direction, fill_vec3f(travelled));
        const float gap     = calc_scene_distance(add_vec3f(origin, offset));

        if (gap < hitThreshold)
        {
            return travelled;
        }

        travelled += gap;

        if (travelled > farLimit)
        {
            break;
        }
    }

    return farLimit;
}


/*
 * Kernel: ray-marches one pixel per thread and stores the resulting colour.
 *
 * Launch layout: a 2D grid of 2D blocks. The x/y block and thread indices
 * are flattened by calc_pixel_index(), so the launch must provide at least
 * w*h threads in total. Threads whose flat index is >= w*h exit immediately
 * without touching memory. The kernel itself declares no shared memory.
 *
 * w, h     image size in pixels
 * pPixels  output buffer with room for w*h float3 values; every component
 *          written has been passed through clampf(v, 0.f, 1.f)
 */
__global__ void render_sphere(const unsigned w, const unsigned h, float3* const pPixels)
{
    const unsigned pixelId = calc_pixel_index();

    // Reject surplus threads before doing any work: the ray march below is
    // by far the most expensive part of the kernel.
    if (pixelId >= w*h)
    {
        return;
    }

    const float2 pixelXY = calc_pixel_pos(w, h, pixelId);

    // Fixed camera: position, viewing direction and the two image-plane axes.
    const float3 eye   = {0.f, 1.f, -1.f};
    const float3 fwd   = {0.f, -1.f, 1.f};
    const float3 up    = {0.f, 1.f, 0.f};
    const float3 right = cross_vec3f(fwd, up);

    // Offset the forward vector by the pixel's position on the image plane.
    // The y coordinate is negated before it scales the up vector.
    const float3 r   = mul_vec3f(right, fill_vec3f(pixelXY.x));
    const float3 u   = mul_vec3f(up, fill_vec3f(-pixelXY.y));
    const float3 ray = normalize_vec3f(add_vec3f(add_vec3f(r, u), fwd));

    const float dist = raymarch(eye, ray);

    // Build the colour in a local and store it once at the end.
    float3 color;

    if (dist < MAX_DIST)
    {
        color = shade_scene(eye, ray, dist);
    }
    else
    {
        // Background colour for rays that hit nothing.
        color.x = 0.1f;
        color.y = 0.2f;
        color.z = 0.3f;
    }

    /* pixel values need to remain between (0, 1) for proper conversion to unsigned chars. */
    color.x = clampf(color.x, 0.f, 1.f);
    color.y = clampf(color.y, 0.f, 1.f);
    color.z = clampf(color.z, 0.f, 1.f);

    pPixels[pixelId] = color;
}


/*
 * Converts a floating-point colour to 8 bits per channel by scaling each
 * component by 255 and rounding down with floorf(). No clamping is done
 * here; the caller is expected to supply components in [0, 1].
 */
__host__ uchar3 float3_to_uchar3(const float3* const pPixel)
{
    const float scale   = 255.f;
    const float3 scaled = {pPixel->x * scale, pPixel->y * scale, pPixel->z * scale};

    uchar3 ret;

    ret.x = (unsigned char)floorf(scaled.x);
    ret.y = (unsigned char)floorf(scaled.y);
    ret.z = (unsigned char)floorf(scaled.z);

    return ret;
}


/*
 * Writes an image to disk as a binary PPM ("P6", 8 bits per channel).
 *
 * w, h       image size in pixels
 * pPixels    w*h colours stored x-major: the pixel at column x, row y lives
 *            at index x*h + y, which is the layout calc_pixel_pos() decodes
 *            (x = index / h). Components are expected to be in [0, 1].
 * pFilename  path of the file to create or overwrite
 *
 * Returns 1 on success, 0 if the file could not be opened or could not be
 * written completely.
 */
__host__ int save_pixels_to_file(const unsigned w, const unsigned h, const float3* const pPixels, const char* const pFilename)
{
    unsigned row, col;
    int ok;
    FILE* pFile = fopen(pFilename, "wb");

    if (!pFile)
    {
        return 0;
    }

    ok = fprintf(pFile, "P6\n%u %u\n255\n", w, h) > 0;

    // PPM stores pixels row by row, so rows form the outer loop.
    for (row = 0; ok && row < h; ++row)
    {
        for (col = 0; col < w; ++col)
        {
            // The column stride of the source buffer is h, not w. Using w
            // reads past the end of the buffer whenever w > h.
            const uchar3 pixel = float3_to_uchar3(&pPixels[(size_t)h * col + row]);
            const unsigned char pixelBuffer[3] = {pixel.x, pixel.y, pixel.z};

            if (fwrite(pixelBuffer, sizeof(unsigned char), sizeof(pixelBuffer), pFile) != sizeof(pixelBuffer))
            {
                ok = 0;
                break;
            }
        }
    }

    // fclose() flushes buffered data; a failure here means the file is incomplete.
    if (fclose(pFile) != 0)
    {
        ok = 0;
    }

    return ok;
}


/*
 * Renders a 1024x1024 image on the GPU and saves it to "cuda_sphere.ppm".
 *
 * Every CUDA call and the file write are checked explicitly rather than
 * being wrapped in assert(): when NDEBUG is defined assert() discards its
 * argument, which would silently remove the calls themselves.
 *
 * Returns 0 on success and EXIT_FAILURE if any step fails. Host and device
 * buffers are released on both paths.
 */
int main()
{
    const unsigned w = 1024;
    const unsigned h = 1024;
    const unsigned numBytes = w * h * sizeof(float3);

    float3* hostPixels = NULL;
    float3* devicePixels = NULL;
    dim3 gridSize, blockSize;
    cudaError_t status = cudaSuccess;
    int exitCode = EXIT_FAILURE;

    printf("\nAllocating %u bytes for host pixel storage...", numBytes);
    // calloc() hands back zero-filled memory, replacing malloc() + memset().
    hostPixels = (float3*)calloc(1, numBytes);
    if (hostPixels == NULL)
    {
        fprintf(stderr, "\nUnable to allocate host pixel storage.\n");
        goto cleanup;
    }
    printf("Success!\n");

    printf("\nAllocating %u bytes for device processing storage...", numBytes);
    status = cudaMalloc((void**)&devicePixels, numBytes);
    if (status == cudaSuccess)
    {
        status = cudaMemset(devicePixels, 0, numBytes);
    }
    if (status != cudaSuccess)
    {
        fprintf(stderr, "\nDevice allocation failed: %s\n", cudaGetErrorString(status));
        goto cleanup;
    }
    printf("Success!\n");

    printf("\nProcessing pixel data on the GPU...");
    blockSize.x = 32;
    blockSize.y = 32;
    blockSize.z = 1;
    // Round up so the grid covers the image even if a dimension is not a
    // multiple of the block size; the kernel discards surplus threads.
    gridSize.x = (w + blockSize.x-1) / blockSize.x;
    gridSize.y = (h + blockSize.y-1) / blockSize.y;
    gridSize.z = 1;
    render_sphere<<< gridSize, blockSize >>>(w, h, devicePixels);
    // A kernel launch returns no status; launch errors have to be fetched
    // with cudaGetLastError().
    status = cudaGetLastError();
    if (status != cudaSuccess)
    {
        fprintf(stderr, "\nKernel launch failed: %s\n", cudaGetErrorString(status));
        goto cleanup;
    }
    printf("Done!\n");

    printf("\nWaiting for the GPU to finish its thang...");
    // Errors raised while the kernel runs are reported by this call.
    status = cudaDeviceSynchronize();
    if (status != cudaSuccess)
    {
        fprintf(stderr, "\nKernel execution failed: %s\n", cudaGetErrorString(status));
        goto cleanup;
    }
    printf("Done!\n");

    printf("\nCopying pixels from the GPU to the host...");
    status = cudaMemcpy(hostPixels, devicePixels, numBytes, cudaMemcpyDeviceToHost);
    if (status != cudaSuccess)
    {
        fprintf(stderr, "\nDevice-to-host copy failed: %s\n", cudaGetErrorString(status));
        goto cleanup;
    }
    printf("Done!\n");

    printf("\nSaving pixels to a PPM image file...");
    if (0 == save_pixels_to_file(w, h, hostPixels, "cuda_sphere.ppm"))
    {
        fprintf(stderr, "\nUnable to write the image file.\n");
        goto cleanup;
    }
    printf("Done!\n");

    exitCode = 0;

cleanup:
    // free(NULL) and cudaFree(NULL) are both no-ops, so this is safe no
    // matter how far the program got.
    printf("\nReleasing host and device memory...");
    free(hostPixels);
    cudaFree(devicePixels);
    printf("Done.\n\n");

    return exitCode;
}

	/*
	* A small raymarcher using CUDA
	*
	* This program renders a dynamically-generated image to a PPM file.
	*
	* nvcc cuda_raymarcher.cu -arch=sm_20 -o cuda_raymarcher
	*/

	#include <assert.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <limits.h>

	#include <cuda_runtime.h>

	#include "cuda_math.h"



	// Base tolerance shared by the ray marcher and the normal estimator. This is
	// a double literal; every use in this file first stores it in a float local.
	#define EPSILON 1.0e-5

	// Upper bound on the number of iterations raymarch() performs per ray.
	#define MAX_STEPS 128
	// Travel distance past which raymarch() gives up. It is also the value that
	// raymarch() returns, and that render_sphere() tests against, for a miss.
	#define MAX_DIST 100.f
	// Scene distance below which raymarch() reports a hit.
	#define MIN_DIST EPSILON

	// Multiplier applied to the dot product in get_diffuse_intensity().
	#define LIGHT_INTENSITY 1.f

	// NOTE(review): not referenced by any code visible in this file.
	#define CAM_NEAR 0.1f



	/*
	 * Flattens the 2D block and thread coordinates of the calling thread into a
	 * single index. Threads of one block are numbered consecutively (row by row
	 * inside the block) and blocks are numbered row by row inside the grid.
	 * Only the x and y launch dimensions are taken into account.
	 */
	__device__ unsigned calc_pixel_index()
	{
	    const unsigned threadsPerBlock = blockDim.x * blockDim.y;
	    const unsigned linearBlock     = blockIdx.y * gridDim.x + blockIdx.x;
	    const unsigned linearThread    = threadIdx.y * blockDim.x + threadIdx.x;

	    return linearBlock * threadsPerBlock + linearThread;
	}



	/*
	 * Converts a flat pixel index into normalized image coordinates.
	 *
	 * The index is decoded x-major: x = arrayId / h and y = arrayId % h, i.e.
	 * the h pixels that share one x value are adjacent in the array.
	 *
	 * w, h     image dimensions in pixels
	 * arrayId  flat pixel index
	 *
	 * Returns {2*x/w - 1, 2*y/h - 1}. For an index below w*h both components
	 * lie in [-1, 1).
	 */
	__device__ float2 calc_pixel_pos(const unsigned w, const unsigned h, const unsigned arrayId)
	{
	    const unsigned x = arrayId / h;
	    // The remainder has to be taken against the same stride as the division
	    // (h). Subtracting x * w is only correct for square images and wraps
	    // around (unsigned arithmetic) as soon as w > h.
	    const unsigned y = arrayId - (x * h);

	    float2 ret;
	    ret.x = 2.f * ((float)x / (float)w) - 1.f;
	    ret.y = 2.f * ((float)y / (float)h) - 1.f;

	    return ret;
	}



	/*
	 * Distance estimate from `pos` to the scene, which is a single torus.
	 *
	 * Assuming fill2_vec2f() builds a float2 from its two arguments and
	 * length_vec2f() returns its Euclidean length (both come from cuda_math.h),
	 * the torus is centred on the origin with its ring in the XZ plane, a ring
	 * radius of 0.35 and a tube radius of 0.1.
	 *
	 * Alternative scene kept for reference (sphere of radius 0.5):
	 *     return length_vec3f(pos) - 0.5f;
	 */
	__device__ float calc_scene_distance(const float3 pos)
	{
	    const float ringRadius = 0.35f;
	    const float tubeRadius = 0.1f;

	    // Distance from the Y axis, then the offset from the ring's centre line.
	    const float2 planar  = fill2_vec2f(pos.x, pos.z);
	    const float2 section = fill2_vec2f(length_vec2f(planar) - ringRadius, pos.y);

	    return length_vec2f(section) - tubeRadius;
	}



	/*
	 * Turns the dot product of the two given vectors into a brightness value.
	 *
	 * The dot product is scaled by LIGHT_INTENSITY, then halved and offset by
	 * 0.5, which maps an input range of [-1, 1] onto [0, 1] so the result can
	 * be used as a colour channel.
	 */
	__device__ float get_diffuse_intensity(const float3 vertNorm, const float3 lightDir)
	{
	    const float scaled = dot_vec3f(vertNorm, lightDir) * LIGHT_INTENSITY;

	    return 0.5f * scaled + 0.5f;
	}



	/*
	 * Estimates the surface orientation at `point` with one-sided differences
	 * of the scene distance function.
	 *
	 * Each component is calc_scene_distance(point - step * axis) minus
	 * calc_scene_distance(point). Because the sample is taken at "point minus
	 * step", the vector before normalization approximates the gradient scaled
	 * by -step. The result is passed through normalize_vec3f().
	 */
	__device__ float3 calc_scene_normal(const float3 point)
	{
	    // Held in a float so every subtraction below is single precision.
	    const float step   = EPSILON;
	    const float centre = calc_scene_distance(point);

	    float3 shifted;
	    float3 gradient;

	    shifted = point;
	    shifted.x -= step;
	    gradient.x = calc_scene_distance(shifted) - centre;

	    shifted = point;
	    shifted.y -= step;
	    gradient.y = calc_scene_distance(shifted) - centre;

	    shifted = point;
	    shifted.z -= step;
	    gradient.z = calc_scene_distance(shifted) - centre;

	    return normalize_vec3f(gradient);
	}



	/*
	 * Computes the grey-scale colour of the surface point reached after
	 * travelling `dist` along `ray` from `eye`.
	 *
	 * The light sits at a fixed position. The direction handed to
	 * get_diffuse_intensity() is (hit point - light position), i.e. it points
	 * from the light towards the surface. The returned colour has the same
	 * brightness in all three channels.
	 */
	__device__ float3 shade_scene(const float3 eye, const float3 ray, const float dist)
	{
	    const float3 lightPos = {3.f, 3.f, 3.f}; // TODO: kill the magic number

	    const float3 travel = mul_vec3f(ray, fill_vec3f(dist));
	    const float3 hit    = add_vec3f(eye, travel);
	    const float3 toHit  = sub_vec3f(hit, lightPos);

	    const float shade = get_diffuse_intensity(calc_scene_normal(hit), normalize_vec3f(toHit));

	    return fill_vec3f(shade);
	}



	// Marches a ray through the scene: starting at `origin`, repeatedly advances
	// along `direction` by the distance calc_scene_distance() reports for the
	// current position.
	//
	// Returns the distance travelled when the scene distance drops below
	// MIN_DIST (a hit). Returns MAX_DIST when the travelled distance exceeds
	// MAX_DIST or when MAX_STEPS iterations pass without a hit.
	__device__ float raymarch(const float3 origin, const float3 direction)
	{
	    // Stored as floats so the comparisons below are done in single precision.
	    const float hitThreshold = MIN_DIST;
	    const float farLimit     = MAX_DIST;

	    float travelled = 0.f;

	    for (unsigned remaining = MAX_STEPS; remaining != 0; --remaining)
	    {
	        const float3 offset = mul_vec3f(direction, fill_vec3f(travelled));
	        const float gap     = calc_scene_distance(add_vec3f(origin, offset));

	        if (gap < hitThreshold)
	        {
	            return travelled;
	        }

	        travelled += gap;

	        if (travelled > farLimit)
	        {
	            break;
	        }
	    }

	    return farLimit;
	}



	/*
	 * Kernel: ray-marches one pixel per thread and stores the resulting colour.
	 *
	 * Launch layout: a 2D grid of 2D blocks. The x/y block and thread indices
	 * are flattened by calc_pixel_index(), so the launch must provide at least
	 * w*h threads in total. Threads whose flat index is >= w*h exit immediately
	 * without touching memory. The kernel itself declares no shared memory.
	 *
	 * w, h     image size in pixels
	 * pPixels  output buffer with room for w*h float3 values; every component
	 *          written has been passed through clampf(v, 0.f, 1.f)
	 */
	__global__ void render_sphere(const unsigned w, const unsigned h, float3* const pPixels)
	{
	    const unsigned pixelId = calc_pixel_index();

	    // Reject surplus threads before doing any work: the ray march below is
	    // by far the most expensive part of the kernel.
	    if (pixelId >= w*h)
	    {
	        return;
	    }

	    const float2 pixelXY = calc_pixel_pos(w, h, pixelId);

	    // Fixed camera: position, viewing direction and the two image-plane axes.
	    const float3 eye   = {0.f, 1.f, -1.f};
	    const float3 fwd   = {0.f, -1.f, 1.f};
	    const float3 up    = {0.f, 1.f, 0.f};
	    const float3 right = cross_vec3f(fwd, up);

	    // Offset the forward vector by the pixel's position on the image plane.
	    // The y coordinate is negated before it scales the up vector.
	    const float3 r   = mul_vec3f(right, fill_vec3f(pixelXY.x));
	    const float3 u   = mul_vec3f(up, fill_vec3f(-pixelXY.y));
	    const float3 ray = normalize_vec3f(add_vec3f(add_vec3f(r, u), fwd));

	    const float dist = raymarch(eye, ray);

	    // Build the colour in a local and store it once at the end.
	    float3 color;

	    if (dist < MAX_DIST)
	    {
	        color = shade_scene(eye, ray, dist);
	    }
	    else
	    {
	        // Background colour for rays that hit nothing.
	        color.x = 0.1f;
	        color.y = 0.2f;
	        color.z = 0.3f;
	    }

	    /* pixel values need to remain between (0, 1) for proper conversion to unsigned chars. */
	    color.x = clampf(color.x, 0.f, 1.f);
	    color.y = clampf(color.y, 0.f, 1.f);
	    color.z = clampf(color.z, 0.f, 1.f);

	    pPixels[pixelId] = color;
	}



	/*
	 * Converts a floating-point colour to 8 bits per channel by scaling each
	 * component by 255 and rounding down with floorf(). No clamping is done
	 * here; the caller is expected to supply components in [0, 1].
	 */
	__host__ uchar3 float3_to_uchar3(const float3* const pPixel)
	{
	    const float scale   = 255.f;
	    const float3 scaled = {pPixel->x * scale, pPixel->y * scale, pPixel->z * scale};

	    uchar3 ret;

	    ret.x = (unsigned char)floorf(scaled.x);
	    ret.y = (unsigned char)floorf(scaled.y);
	    ret.z = (unsigned char)floorf(scaled.z);

	    return ret;
	}



	/*
	 * Writes an image to disk as a binary PPM ("P6", 8 bits per channel).
	 *
	 * w, h       image size in pixels
	 * pPixels    w*h colours stored x-major: the pixel at column x, row y lives
	 *            at index x*h + y, which is the layout calc_pixel_pos() decodes
	 *            (x = index / h). Components are expected to be in [0, 1].
	 * pFilename  path of the file to create or overwrite
	 *
	 * Returns 1 on success, 0 if the file could not be opened or could not be
	 * written completely.
	 */
	__host__ int save_pixels_to_file(const unsigned w, const unsigned h, const float3* const pPixels, const char* const pFilename)
	{
	    unsigned row, col;
	    int ok;
	    FILE* pFile = fopen(pFilename, "wb");

	    if (!pFile)
	    {
	        return 0;
	    }

	    ok = fprintf(pFile, "P6\n%u %u\n255\n", w, h) > 0;

	    // PPM stores pixels row by row, so rows form the outer loop.
	    for (row = 0; ok && row < h; ++row)
	    {
	        for (col = 0; col < w; ++col)
	        {
	            // The column stride of the source buffer is h, not w. Using w
	            // reads past the end of the buffer whenever w > h.
	            const uchar3 pixel = float3_to_uchar3(&pPixels[(size_t)h * col + row]);
	            const unsigned char pixelBuffer[3] = {pixel.x, pixel.y, pixel.z};

	            if (fwrite(pixelBuffer, sizeof(unsigned char), sizeof(pixelBuffer), pFile) != sizeof(pixelBuffer))
	            {
	                ok = 0;
	                break;
	            }
	        }
	    }

	    // fclose() flushes buffered data; a failure here means the file is incomplete.
	    if (fclose(pFile) != 0)
	    {
	        ok = 0;
	    }

	    return ok;
	}



	/*
	 * Renders a 1024x1024 image on the GPU and saves it to "cuda_sphere.ppm".
	 *
	 * Every CUDA call and the file write are checked explicitly rather than
	 * being wrapped in assert(): when NDEBUG is defined assert() discards its
	 * argument, which would silently remove the calls themselves.
	 *
	 * Returns 0 on success and EXIT_FAILURE if any step fails. Host and device
	 * buffers are released on both paths.
	 */
	int main()
	{
	    const unsigned w = 1024;
	    const unsigned h = 1024;
	    const unsigned numBytes = w * h * sizeof(float3);

	    float3* hostPixels = NULL;
	    float3* devicePixels = NULL;
	    dim3 gridSize, blockSize;
	    cudaError_t status = cudaSuccess;
	    int exitCode = EXIT_FAILURE;

	    printf("\nAllocating %u bytes for host pixel storage...", numBytes);
	    // calloc() hands back zero-filled memory, replacing malloc() + memset().
	    hostPixels = (float3*)calloc(1, numBytes);
	    if (hostPixels == NULL)
	    {
	        fprintf(stderr, "\nUnable to allocate host pixel storage.\n");
	        goto cleanup;
	    }
	    printf("Success!\n");

	    printf("\nAllocating %u bytes for device processing storage...", numBytes);
	    status = cudaMalloc((void**)&devicePixels, numBytes);
	    if (status == cudaSuccess)
	    {
	        status = cudaMemset(devicePixels, 0, numBytes);
	    }
	    if (status != cudaSuccess)
	    {
	        fprintf(stderr, "\nDevice allocation failed: %s\n", cudaGetErrorString(status));
	        goto cleanup;
	    }
	    printf("Success!\n");

	    printf("\nProcessing pixel data on the GPU...");
	    blockSize.x = 32;
	    blockSize.y = 32;
	    blockSize.z = 1;
	    // Round up so the grid covers the image even if a dimension is not a
	    // multiple of the block size; the kernel discards surplus threads.
	    gridSize.x = (w + blockSize.x-1) / blockSize.x;
	    gridSize.y = (h + blockSize.y-1) / blockSize.y;
	    gridSize.z = 1;
	    render_sphere<<< gridSize, blockSize >>>(w, h, devicePixels);
	    // A kernel launch returns no status; launch errors have to be fetched
	    // with cudaGetLastError().
	    status = cudaGetLastError();
	    if (status != cudaSuccess)
	    {
	        fprintf(stderr, "\nKernel launch failed: %s\n", cudaGetErrorString(status));
	        goto cleanup;
	    }
	    printf("Done!\n");

	    printf("\nWaiting for the GPU to finish its thang...");
	    // Errors raised while the kernel runs are reported by this call.
	    status = cudaDeviceSynchronize();
	    if (status != cudaSuccess)
	    {
	        fprintf(stderr, "\nKernel execution failed: %s\n", cudaGetErrorString(status));
	        goto cleanup;
	    }
	    printf("Done!\n");

	    printf("\nCopying pixels from the GPU to the host...");
	    status = cudaMemcpy(hostPixels, devicePixels, numBytes, cudaMemcpyDeviceToHost);
	    if (status != cudaSuccess)
	    {
	        fprintf(stderr, "\nDevice-to-host copy failed: %s\n", cudaGetErrorString(status));
	        goto cleanup;
	    }
	    printf("Done!\n");

	    printf("\nSaving pixels to a PPM image file...");
	    if (0 == save_pixels_to_file(w, h, hostPixels, "cuda_sphere.ppm"))
	    {
	        fprintf(stderr, "\nUnable to write the image file.\n");
	        goto cleanup;
	    }
	    printf("Done!\n");

	    exitCode = 0;

	cleanup:
	    // free(NULL) and cudaFree(NULL) are both no-ops, so this is safe no
	    // matter how far the program got.
	    printf("\nReleasing host and device memory...");
	    free(hostPixels);
	    cudaFree(devicePixels);
	    printf("Done.\n\n");

	    return exitCode;
	}