BrentFarris/bake.bat

## bake.bat
ffmpeg -r 30 -f image2 -c:v ppm -s 640x480 -start_number 0 -i frame-%%d.ppm -vcodec libx264 -crf 25  -pix_fmt yuv420p baked.mp4

## old-cuda-trace-kernel.cu
#include "cuda_runtime.h"
#include "curand_kernel.h"
#include "sm_20_atomic_functions.h"
#include "device_launch_parameters.h"

#define _CRT_SECURE_NO_DEPRECATE
#define _USE_MATH_DEFINES

#define _USE_MATH_DEFINES
#include <math.h>
#include <time.h>
#include <float.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <assert.h>
#include <stdbool.h>

//#define WIDTH				200
//#define HEIGHT				100
//#define WIDTH				256
//#define HEIGHT				240
//#define WIDTH				320
//#define HEIGHT				240
//#define WIDTH				640
//#define HEIGHT				480
#define WIDTH				720
#define HEIGHT				480
//#define WIDTH				858
//#define HEIGHT				480
//#define WIDTH				1920
//#define HEIGHT				1080
#define PIXEL_DIM_SPLIT		2
#define PIXEL_SIZE			sizeof(struct v3) * WIDTH * HEIGHT
#define SAMPLES				16
#define MAX_DEPTH			50
#define RAND_COUNT_THREAD	8 * MAX_DEPTH
#define RANDS_COUNT			RAND_COUNT_THREAD * SAMPLES * PIXEL_DIM_SPLIT
#define RANDS_SIZE			sizeof(float) * RANDS_COUNT
#define MEM_LENGTH			WIDTH * HEIGHT * 3
#define SPHERE_COUNT		175
#define PPM_HEADER_FORMAT	"P3\n%d %d\n255\n"
#define FRAMES				120

#ifdef __cplusplus
extern "C" {
#endif
/************************************************************************/
/************************************************************************/
/* MTwister                                                             */
/************************************************************************/
/************************************************************************/
// https://github.com/ESultanik/mtwister
/* An implementation of the MT19937 Algorithm for the Mersenne Twister
	* by Evan Sultanik.  Based upon the pseudocode in: M. Matsumoto and
	* T. Nishimura, "Mersenne Twister: A 623-dimensionally
	* equidistributed uniform pseudorandom number generator," ACM
	* Transactions on Modeling and Computer Simulation Vol. 8, No. 1,
	* January pp.3-30 1998.
	*
	* http://www.sultanik.com/Mersenne_twister
	*/

#define STATE_VECTOR_LENGTH	624
#define STATE_VECTOR_SIZE	sizeof(uint64_t) * STATE_VECTOR_LENGTH
#define STATE_VECTOR_M		397 /* changes to STATE_VECTOR_LENGTH also require changes to this */

#define UPPER_MASK			0x80000000
#define LOWER_MASK			0x7fffffff
#define TEMPERING_MASK_B	0x9d2c5680
#define TEMPERING_MASK_C	0xefc60000

typedef struct tagMTRand {
	uint64_t* mt;
	int32_t index;
} MTRand;

__device__ __host__ static void m_seedRand(MTRand* rnd, uint64_t seed)
{
	/* set initial seeds to mt[STATE_VECTOR_LENGTH] using the generator
		* from Line 25 of Table 1 in: Donald Knuth, "The Art of Computer
		* Programming," Vol. 2 (2nd Ed.) pp.102.
		*/
	rnd->mt = (uint64_t*)malloc(STATE_VECTOR_SIZE);
	rnd->mt[0] = seed & 0xffffffff;
	for (rnd->index = 1; rnd->index < STATE_VECTOR_LENGTH; rnd->index++)
		rnd->mt[rnd->index] = (6069 * rnd->mt[rnd->index - 1]) & 0xffffffff;
}

/**
* Creates a new random number generator from a given seed.
*/
__device__ __host__ MTRand seed_rand(uint64_t seed)
{
	MTRand rnd;
	m_seedRand(&rnd, seed);
	return rnd;
}

__device__ __host__ void mt_rand_free(MTRand* rnd)
{
	free(rnd->mt);
}

MTRand make_rand_seed()
{
	time_t now;
	time(&now);
	return seed_rand(now);
}

/**
	* Generates a pseudo-randomly generated long.
	*/
__device__ __host__ uint64_t gen_rand_long(MTRand* rnd)
{
	uint64_t y;
	static uint64_t mag[2] = { 0x0, 0x9908b0df }; /* mag[x] = x * 0x9908b0df for x = 0,1 */
	if (rnd->index >= STATE_VECTOR_LENGTH || rnd->index < 0)
	{
		/* generate STATE_VECTOR_LENGTH words at a time */
		int32_t kk;
		if (rnd->index >= STATE_VECTOR_LENGTH + 1 || rnd->index < 0)
		{
			m_seedRand(rnd, 4357);
		}
		for (kk = 0; kk < STATE_VECTOR_LENGTH - STATE_VECTOR_M; kk++)
		{
			y = (rnd->mt[kk] & UPPER_MASK) | (rnd->mt[kk + 1] & LOWER_MASK);
			rnd->mt[kk] = rnd->mt[kk + STATE_VECTOR_M] ^ (y >> 1) ^ mag[y & 0x1];
		}
		for (; kk < STATE_VECTOR_LENGTH - 1; kk++)
		{
			y = (rnd->mt[kk] & UPPER_MASK) | (rnd->mt[kk + 1] & LOWER_MASK);
			rnd->mt[kk] = rnd->mt[kk + (STATE_VECTOR_M - STATE_VECTOR_LENGTH)] ^ (y >> 1) ^ mag[y & 0x1];
		}
		y = (rnd->mt[STATE_VECTOR_LENGTH - 1] & UPPER_MASK) | (rnd->mt[0] & LOWER_MASK);
		rnd->mt[STATE_VECTOR_LENGTH - 1] = rnd->mt[STATE_VECTOR_M - 1] ^ (y >> 1) ^ mag[y & 0x1];
		rnd->index = 0;
	}
	y = rnd->mt[rnd->index++];
	y ^= (y >> 11);
	y ^= (y << 7) & TEMPERING_MASK_B;
	y ^= (y << 15) & TEMPERING_MASK_C;
	y ^= (y >> 18);
	return y;
}

/**
	* Generates a pseudo-randomly generated double in the range [0..1].
	*/
__device__ __host__ double gen_rand(MTRand* rnd)
{
	double val = 0.0 + gen_rand_long(rnd);
	return(val / (double)0xffffffff);
}

__device__ __host__ int32_t rand_int32(MTRand* rnd, int32_t min, int32_t max)
{
	uint64_t r = gen_rand_long(rnd);
	int32_t q = r % INT32_MAX;
	if (min < 0)
	{
		if (r <= UINT64_MAX / 2)
			q *= -1;
	}
	return q % (max - min) + min;
}

__device__ __host__ float rand_float(MTRand* rnd, float min, float max)
{
	return (float)gen_rand(rnd) * (max - min) + min;
}

/************************************************************************/
/************************************************************************/
/* V3                                                                   */
/************************************************************************/
/************************************************************************/
typedef struct v3 {
	union {
		float x;
		float r;
	};
	union {
		float y;
		float g;
	};
	union {
		float z;
		float b;
	};
} v3;

__device__ __host__ static v3 v3_zero()
{
	v3 v;
	v.x = 0.0F;
	v.y = 0.0F;
	v.z = 0.0F;
	return v;
}
__device__ __host__ static v3 v3_one()
{
	v3 v;
	v.x = 1.0F;
	v.y = 1.0F;
	v.z = 1.0F;
	return v;
}
__device__ __host__ static v3 v3_up()
{
	v3 v;
	v.x = 0.0F;
	v.y = 1.0F;
	v.z = 0.0F;
	return v;
}
__device__ __host__ static v3 v3_get(const float x, const float y, const float z)
{
	v3 v;
	v.x = x;
	v.y = y;
	v.z = z;
	return v;
}

__device__ __host__ void v3_load(const v3 from, v3* to)
{
	to->x = from.x;
	to->y = from.y;
	to->z = from.z;
}

__device__ __host__ void v3_copy(const v3 from, v3* to)
{
	to->x = from.x;
	to->y = from.y;
	to->z = from.z;
}

__device__ __host__ void v3_add(v3* lhs, const v3 rhs)
{
	lhs->x += rhs.x;
	lhs->y += rhs.y;
	lhs->z += rhs.z;
}

__device__ __host__ void v3_subtract(v3* lhs, const v3 rhs)
{
	lhs->x -= rhs.x;
	lhs->y -= rhs.y;
	lhs->z -= rhs.z;
}

__device__ __host__ void v3_multiply(v3* lhs, const v3 rhs)
{
	lhs->x *= rhs.x;
	lhs->y *= rhs.y;
	lhs->z *= rhs.z;
}

__device__ __host__ void v3_divide(v3* lhs, const v3 rhs)
{
	lhs->x /= rhs.x;
	lhs->y /= rhs.y;
	lhs->z /= rhs.z;
}

__device__ __host__ void v3_add_using(v3* lhs, const float x, const float y, const float z)
{
	lhs->x += x;
	lhs->y += y;
	lhs->z += z;
}

__device__ __host__ void v3_subtract_using(v3* lhs, const float x, const float y, const float z)
{
	lhs->x -= x;
	lhs->y -= y;
	lhs->z -= z;
}

__device__ __host__ v3 v3_add_to(const v3 a, const v3 b)
{
	v3 v;
	v.x = a.x + b.x;
	v.y = a.y + b.y;
	v.z = a.z + b.z;
	return v;
}

__device__ __host__ v3 v3_subtract_to(const v3 a, const v3 b)
{
	v3 v;
	v.x = a.x - b.x;
	v.y = a.y - b.y;
	v.z = a.z - b.z;
	return v;
}

__device__ __host__ void v3_scale(v3* from, const float scale)
{
	from->x *= scale;
	from->y *= scale;
	from->z *= scale;
}

__device__ __host__ v3 v3_scale_from(const v3 vector, const float scale)
{
	v3 v;
	v.x = vector.x * scale;
	v.y = vector.y * scale;
	v.z = vector.z * scale;
	return v;
}

__device__ __host__ v3 v3_scale_to(const v3 vector, const float scale)
{
	v3 v;
	v.x = vector.x * scale;
	v.y = vector.y * scale;
	v.z = vector.z * scale;
	return v;
}

__device__ __host__ float v3_magnitude(const v3 vector)
{
	return sqrtf((vector.x * vector.x) + (vector.y * vector.y) + (vector.z * vector.z));
}

__device__ __host__ float v3_distance(const v3 a, const v3 b)
{
	return v3_magnitude(v3_subtract_to(a, b));
}

__device__ __host__ void v3_normalize(v3* vector)
{
	float mag = v3_magnitude(*vector);
	vector->x /= mag;
	vector->y /= mag;
	vector->z /= mag;
}

__device__ __host__ v3 v3_normalized(const v3 vector)
{
	float mag = v3_magnitude(vector);
	if (mag != 0)
	{
		v3 copy = vector;
		copy.x /= mag;
		copy.y /= mag;
		copy.z /= mag;
		return copy;
	}
	return v3_zero();
}

__device__ __host__ float v3_dot(const v3 a, const v3 b)
{
	return (a.x * b.x) + (a.y * b.y) + (a.z * b.z);
}

__device__ __host__ float v3_dot_from(const v3 a, const v3 b)
{
	return (a.x * b.x) + (a.y * b.y) + (a.z * b.z);
}

__device__ __host__ v3 v3_cross(const v3 a, const v3 b)
{
	v3 cross;
	cross.x = (a.y * b.z) - (a.z * b.y);
	cross.y = (a.z * b.x) - (a.x * b.z);
	cross.z = (a.x * b.y) - (a.y * b.x);
	return cross;
}

__device__ __host__ v3 v3_unit(const v3 vec)
{
	v3 out = vec;
	float len = 1.0F / v3_magnitude(vec);
	v3_scale(&out, len);
	return out;
}

/************************************************************************/
/************************************************************************/
/* Ray                                                                  */
/************************************************************************/
/************************************************************************/
typedef struct Ray {
	v3 ori;
	v3 dir;
} Ray;

__device__ __host__ static Ray ray_get(const v3 origin, const v3 direction)
{
	Ray r;
	r.ori = origin;
	r.dir = direction;
	return r;
}
__device__ __host__ static v3 ray_origin(const Ray* ray)
{
	return ray->ori;
}
__device__ __host__ static v3 ray_direction(const Ray* ray)
{
	return ray->dir;
}

__device__ __host__ v3 ray_point(const Ray* ray, const float len)
{
	/*--- origin + length * direction -------------------------------------*/
	v3 p = ray->dir;
	v3_scale(&p, len);
	v3_add(&p, ray->ori);
	return p;
}

/************************************************************************/
/************************************************************************/
/* Communication structures                                             */
/************************************************************************/
/************************************************************************/
struct ColorPick {
	v3 color;
	Ray scatter;
	uint16_t depth;
	bool done;
};
struct DeviceRands {
	const float* nums;
	size_t idx;
};

/************************************************************************/
/************************************************************************/
/* Camera                                                               */
/************************************************************************/
/************************************************************************/
struct Camera {
	v3 origin;
	v3 lowerLeft;
	v3 horizontal;
	v3 vertical;
	v3 u;
	v3 v;
	v3 w;
	float lensRadius;
};

void Camera_set_view(struct Camera* cam, const v3 origin, const v3 lookAt, const v3 up,
	const float fov, const float aspect, const float focalDistance)
{
	const float theta = (float)(fov * M_PI / 180.0F);
	const float halfHeight = tanf(theta / 2.0F);
	const float halfWidth = aspect * halfHeight;
	cam->origin = origin;
	cam->w = v3_unit(v3_subtract_to(origin, lookAt));
	cam->u = v3_unit(v3_cross(up, cam->w));
	cam->v = v3_cross(cam->w, cam->u);
	//cam->lowerLeft = v3_get(-halfWidth, -halfHeight, -1.0F);
	cam->lowerLeft = cam->origin;
	v3_subtract(&cam->lowerLeft, v3_scale_to(cam->u, halfWidth * focalDistance));
	v3_subtract(&cam->lowerLeft, v3_scale_to(cam->v, halfHeight * focalDistance));
	v3_subtract(&cam->lowerLeft, v3_scale_to(cam->w, focalDistance));
	cam->horizontal = cam->u;
	v3_scale(&cam->horizontal, halfWidth * 2.0F * focalDistance);
	cam->vertical = cam->v;
	v3_scale(&cam->vertical, halfHeight * 2.0F * focalDistance);
}

struct Camera* Camera_new(const v3 origin, const v3 lookAt, const v3 up, const float fov, const float aspect,
	const float aperture, const float focalDistance)
{
	struct Camera* cam = (struct Camera*)calloc(1, sizeof(struct Camera));
	cam->lensRadius = aperture / 2.0F;
	Camera_set_view(cam, origin, lookAt, up, fov, aspect, focalDistance);
	return cam;
}

void Camera_free(struct Camera* cam)
{
	free(cam);
}

__device__ static v3 local_random_unit_disk(struct DeviceRands* rng)
{
	v3 p = v3_get(rng->nums[rng->idx++], rng->nums[rng->idx++], 0.0F);
	v3_scale(&p, 2.0F);
	v3_subtract(&p, v3_get(1.0F, 1.0F, 0.0F));
	if (v3_dot(p, p) >= 1.0F)
		v3_scale(&p, 0.5F);
	return p;
}

__device__ Ray Camera_ray(struct Camera* cam, const float u, const float v, struct DeviceRands* rng)
{
	v3 r = v3_scale_to(local_random_unit_disk(rng), cam->lensRadius);
	v3 offset = v3_scale_to(cam->u, r.x);
	v3_add(&offset, v3_scale_to(cam->v, r.y));
	v3 dir = cam->lowerLeft;
	v3_add(&dir, v3_scale_to(cam->horizontal, u));
	v3_add(&dir, v3_scale_to(cam->vertical, v));
	v3 oo = v3_add_to(cam->origin, offset);
	v3_subtract(&dir, oo);
	return ray_get(oo, dir);
}

/************************************************************************/
/************************************************************************/
/* Hit                                                                  */
/************************************************************************/
/************************************************************************/
struct Material;
typedef struct Hit {
	v3 p;		// Point
	v3 nml;		// Normal
	float t;	// Time (for like motion blur or w/e)
	const struct Material* mat;
} Hit;

/************************************************************************/
/************************************************************************/
/* Material                                                             */
/************************************************************************/
/************************************************************************/
struct Material {
	v3 albedo;
	float fuzz;
	float refractIdx;
	int32_t scatterType;
};

__device__ static v3 rnd_unit_sphere_point(struct DeviceRands* rng)
{
	v3 p = v3_get(rng->nums[rng->idx++], rng->nums[rng->idx++], rng->nums[rng->idx++]);
	if (v3_magnitude(p) > 0.5F)
		v3_scale(&p, 0.5F);
	return p;
}

__device__ static float schlick(const float cosine, const float refractIdx)
{
	// https://en.wikipedia.org/wiki/Schlick%27s_approximation
	float r0 = (1 - refractIdx) / (1 + refractIdx);
	r0 = r0 * r0;
	return r0 + (1.0F - r0) * powf((1.0F - cosine), 5.0f);
}

__device__ static v3 reflect(const v3 v, const v3 n)
{
	const float dot = v3_dot(v, n);
	v3 r = v3_scale_to(n, dot * 2.0F);
	return v3_subtract_to(v, r);
}

__device__ static bool refract(const v3 v, const v3 n, const float niOverNt, v3* outRefracted)
{
	v3 uv = v3_unit(v);
	const float dt = v3_dot(uv, n);
	const float discriminant = 1.0F - niOverNt * niOverNt * (1.0F - dt * dt);
	if (discriminant > 0.0F)
	{
		/*--- niOverNt * (uv - n * dt) -----------------------------------------*/
		v3 l = v3_scale_to(v3_subtract_to(uv, v3_scale_to(n, dt)), niOverNt);
		/*--- n * sqrt(discriminant) -------------------------------------------*/
		v3 r = v3_scale_to(n, sqrtf(discriminant));
		*outRefracted = v3_subtract_to(l, r);
		return true;
	}
	return false;
}

__device__ bool mat_scatter_lambert(const Hit* hit, v3* outAttenuation, Ray* outRay, struct DeviceRands* rng)
{
	// TODO:  Possibly can scatter with some probability and then attenuation could be albedo/probability
	v3 target = v3_add_to(hit->p, hit->nml);
	v3_add(&target, rnd_unit_sphere_point(rng));
	*outRay = ray_get(hit->p, v3_subtract_to(target, hit->p));
	*outAttenuation = hit->mat->albedo;
	return true;
}

__device__ bool mat_scatter_metal(const Hit* hit, const Ray* ray, v3* outAttenuation, Ray* outRay, struct DeviceRands* rng)
{
	v3 reflected = reflect(v3_unit(ray_direction(ray)), hit->nml);
	v3 dir = v3_scale_to(rnd_unit_sphere_point(rng), hit->mat->fuzz);
	v3_add(&dir, reflected);
	*outRay = ray_get(hit->p, dir);
	*outAttenuation = hit->mat->albedo;
	return v3_dot(ray_direction(outRay), hit->nml) > 0.0F;
}

__device__ bool mat_scatter_dielectric(const Hit* hit, const Ray* ray, v3* outAttenuation, Ray* outRay, struct DeviceRands* rng)
{
	v3 oNorm;
	float niOverNt;
	float reflectProb;
	float cosine;

	/*--- This const value because glass doesn't absorb --------------------*/
	*outAttenuation = v3_get(1.0F, 1.0F, 1.0F);
	if (v3_dot(ray_direction(ray), hit->nml) > 0.0F)
	{
		oNorm = v3_scale_to(hit->nml, -1.0F);
		niOverNt = hit->mat->refractIdx;
		//cosine = mat->refractIdx * v3_dot(ray_direction(ray), hit->nml) / v3_magnitude(ray_direction(ray));
		cosine = v3_dot(ray_direction(ray), hit->nml) / v3_magnitude(ray_direction(ray));
		cosine = sqrtf(1.0F - hit->mat->refractIdx * hit->mat->refractIdx * (1.0F - cosine * cosine));
	}
	else
	{
		oNorm = hit->nml;
		niOverNt = 1.0F / hit->mat->refractIdx;
		cosine = -v3_dot(ray_direction(ray), hit->nml) / v3_magnitude(ray_direction(ray));
	}
	v3 refracted;
	v3 reflected = reflect(ray_direction(ray), hit->nml);
	if (refract(ray_direction(ray), oNorm, niOverNt, &refracted))
		reflectProb = schlick(cosine, hit->mat->refractIdx);
	else
		reflectProb = 1.0F;
	if (rng->nums[rng->idx++] < reflectProb)
		*outRay = ray_get(hit->p, reflected);
	else
		*outRay = ray_get(hit->p, refracted);
	return true;
}

struct Material mat_lambert(const v3 albedo)
{
	struct Material m;
	m.albedo = albedo;
	m.scatterType = 1;
	return m;
}

struct Material mat_metal(const v3 albedo, const float fuzz)
{
	struct Material m;
	m.albedo = albedo;
	m.fuzz = fuzz;
	m.scatterType = 2;
	return m;
}

struct Material mat_glass(const float refractIndex)
{
	struct Material m;
	m.refractIdx = refractIndex;
	m.scatterType = 3;
	return m;
}

/************************************************************************/
/************************************************************************/
/* Sphere                                                               */
/************************************************************************/
/************************************************************************/
struct Sphere {
	v3 center;
	float radius;
	struct Material mat;
};

struct Sphere sphere_get(const v3 center, const float radius, const struct Material mat)
{
	struct Sphere s;
	s.center = center;
	s.radius = radius;
	s.mat = mat;
	return s;
}

__device__ bool sphere_hit(const struct Sphere* sphere, const Ray* ray, const float tMin, float tMax, Hit* outHit)
{
	v3 oc = v3_subtract_to(ray_origin(ray), sphere->center);
	float a = v3_dot(ray_direction(ray), ray_direction(ray));
	float b = v3_dot(oc, ray_direction(ray));
	float c = v3_dot(oc, oc) - sphere->radius * sphere->radius;
	float discriminant = b * b - a * c;
	if (discriminant > 0.0F)
	{
		float t = 0.0F;
		float t0 = (-b - sqrtf(discriminant)) / a;
		float t1 = (-b + sqrtf(discriminant)) / a;
		if (t0 < tMax && t0 > tMin)
			t = t0;
		else if (t1 < tMax && t1 > tMin)
			t = t1;
		else
			return false;
		outHit->t = t;
		outHit->p = ray_point(ray, t);
		outHit->nml = outHit->p;
		v3_subtract(&outHit->nml, sphere->center);
		v3_scale(&outHit->nml, 1.0F / sphere->radius);
		outHit->mat = &sphere->mat;
		return true;
	}
	return false;
}

#ifdef __cplusplus
}
#endif
/************************************************************************/
/************************************************************************/
/* Global                                                               */
/************************************************************************/
/************************************************************************/
__device__ static void color(Ray* ray, struct Sphere* spheres, const size_t sphereCount, struct DeviceRands* rng, struct ColorPick* pick)
{
	Hit hit;
	hit.t = FLT_MAX;
	for (size_t i = 0; i < sphereCount; ++i)
	{
		Hit check;
		check.t = FLT_MAX;
		sphere_hit(spheres + i, ray, 0.001F, FLT_MAX, &check);
		/*--- Near 0 should be ignored, thus 0.001F ----------------------------*/
		if (check.t < hit.t)
			hit = check;
	}
	if (hit.t < FLT_MAX)
	{
		Ray scattered;
		v3 attenuation;
		bool valid = false;
		switch (hit.mat->scatterType)
		{
			case 1:
				valid = mat_scatter_lambert(&hit, &attenuation, &scattered, rng);
				break;
			case 2:
				valid = mat_scatter_metal(&hit, ray, &attenuation, &scattered, rng);
				break;
			case 3:
				valid = mat_scatter_dielectric(&hit, ray, &attenuation, &scattered, rng);
				break;
		}
		if (valid)
		{
			pick->scatter = scattered;
			v3_multiply(&pick->color, attenuation);
			return;
		}
		else
		{
			pick->color = v3_zero();
			pick->done = true;
			return;
		}
	}
	// 1 = blue & 0 = white
	v3 unit = v3_unit(ray_direction(ray));	// -1 < y < 1
	float t = 0.5F * (unit.y + 1.0F);		// Scale above to 0 < y < 1
	/*--- ((1.0 - t) * <1.0,1.0,1.0>) + (t * <0.5,0.7,0.1>) ----------------*/
	v3 start = v3_one();
	v3 end = v3_get(0.5F, 0.7F, 1.0F);
	v3_scale(&end, t);
	v3_scale(&start, 1.0F - t);
	v3_add(&start, end);
	v3_multiply(&pick->color, start);
	pick->done = true;
}

static void write_image(const uint8_t* buff, const size_t len, const int32_t width, const int32_t height, const int32_t frameIdx)
{
	// TODO:  The buf should be from float[] to uint8_t[]

	assert(len > 0);
	assert(width > 0);
	assert(height > 0);
	assert(len % 3 == 0);
	char name[128];
	snprintf(name, 128, "frame-%d.ppm", frameIdx);
	FILE* fp = fopen(name, "w");
	if (fp == NULL)
		return;

	char header[sizeof(PPM_HEADER_FORMAT) + 20];
	snprintf(header, sizeof(header), PPM_HEADER_FORMAT, width, height);
	fwrite(header, strlen(header), 1, fp);

	char rgb[4];
	const size_t colCount = (size_t)width * 3;
	for (size_t i = 0; i < len;)
	{
		for (size_t c = 0; c < colCount; ++c, ++i)
		{
			_itoa(buff[i], rgb, 10);
			fwrite(rgb, strlen(rgb), 1, fp);
			fwrite(" ", 1, 1, fp);
		}
		fwrite("\n", 1, 1, fp);
	}

	fclose(fp);
}

static uint8_t fltoui8(const float val)
{
	return (uint8_t)(val * 255.99F);
}

static struct Sphere* get_spheres(const size_t count, size_t* outCount)
{
	MTRand r = make_rand_seed();
	MTRand* rnd = &r;

	struct Sphere* spheres = (struct Sphere*)malloc(sizeof(struct Sphere) * (count + 1));
	size_t i = 0;

	spheres[i++] = sphere_get(v3_get(0.0F, -1000.0F, 0.0F), 1000.0F, mat_lambert(v3_get(0.5F, 0.5F, 0.5F)));
	const v3 m = v3_get(4.0F, 0.2F, 0.0F);
	for (int32_t a = -5; a < 5; ++a)
	{
		for (int32_t b = -5; b < 5; ++b)
		{
			v3 center = v3_get(a + 0.9F * rand_float(rnd, 0.0F, 1.0F), 0.2F, b + 0.9F * rand_float(rnd, 0.0F, 1.0F));
			if (v3_magnitude(v3_subtract_to(center, m)) > 0.9F)
			{
				float matChoice = rand_float(rnd, 0.0F, 1.0F);
				if (matChoice < 0.8F)
					spheres[i++] = sphere_get(center, 0.2F, mat_lambert(v3_get(rand_float(rnd, 0.0F, 1.0F), rand_float(rnd, 0.0F, 1.0F), rand_float(rnd, 0.0F, 1.0F))));
				else if (matChoice < 0.95F)
					spheres[i++] = sphere_get(center, 0.2F, mat_metal(v3_get((1 + rand_float(rnd, 0.0F, 1.0F)) * 0.5F, (1 + rand_float(rnd, 0.0F, 1.0F)) * 0.5F, (1 + rand_float(rnd, 0.0F, 1.0F)) * 0.5F), rand_float(rnd, 0.0F, 1.0F) * 0.5F));
				else
					spheres[i++] = sphere_get(center, 0.2F, mat_glass(1.5F));
			}

			if (i == count - 3)
				break;
		}

		if (i == count - 3)
			break;
	}
	spheres[i++] = sphere_get(v3_get(0.0F, 1.0F, 0.0F), 1.0F, mat_glass(1.5F));
	spheres[i++] = sphere_get(v3_get(-4.0F, 1.0F, 0.0F), 1.0F, mat_lambert(v3_get(0.4F, 0.2F, 0.1F)));
	spheres[i++] = sphere_get(v3_get(4.0F, 1.0F, 0.0F), 1.0F, mat_metal(v3_get(0.7F, 0.6F, 0.5F), 0.0F));
	*outCount = i;
	mt_rand_free(rnd);
	return spheres;
}

__constant__ struct Sphere cuda_Spheres[500];

__global__ void dummy(struct Camera* cam, struct Sphere* spheres, size_t sphereCount, struct v3* rgb, const float* dRands)
{
	//extern __shared__ v3 samples[];
	int threadsPerBlock = blockDim.x * blockDim.y * blockDim.z;
	int blocksPerGrid = gridDim.x * gridDim.y * gridDim.z;
	int threadPositionInBlock = threadIdx.x +
		blockDim.x * threadIdx.y +
		blockDim.x * blockDim.y * threadIdx.z;
	int blockPosInGrid = blockIdx.x +
		gridDim.x * blockIdx.y +
		gridDim.x * gridDim.y * blockIdx.z;
	//int idx = threadPositionInBlock * blocksPerGrid + blockPosInGrid;
	//int idx = blockPosInGrid * threadsPerBlock + threadPositionInBlock;
	//int idx = blockIdx.y * gridDim.x + blockIdx.x;
	int h = (blockIdx.x * blockDim.y) + threadIdx.y;
	int idx = (blockIdx.y * gridDim.x * blockDim.y) + h;
	rgb[idx] = v3_zero();

	struct ColorPick pick;
	struct DeviceRands rng;
	rng.idx = 0;
	//int rndOffset = threadPositionInBlock * RAND_COUNT_THREAD;
	int rndOffset = threadIdx.x * RAND_COUNT_THREAD;
	//int rndOffset = (idx + threadIdx.x * RAND_COUNT_THREAD) % (RANDS_COUNT - RAND_COUNT_THREAD);
	rng.nums = dRands + rndOffset;
	const float u = ((float)h + rng.nums[rng.idx++]) / WIDTH;
	//const float u = ((float)blockIdx.x + rng.nums[rng.idx++]) / WIDTH;
	const float v = ((float)HEIGHT - blockIdx.y + rng.nums[rng.idx++]) / HEIGHT;
	pick.scatter = Camera_ray(cam, u, v, &rng);
	pick.done = false;
	pick.depth = 0;
	pick.color = v3_one();
	while (!pick.done && pick.depth < MAX_DEPTH)
	{
		color(&pick.scatter, cuda_Spheres, sphereCount, &rng, &pick);
		pick.depth++;
	}
	//samples[threadPositionInBlock] = pick.color;
	// Block for all threads and sum up shared values
	//__syncthreads();
	//if (threadPositionInBlock == 0)
	//{
	//	v3 c = v3_zero();
	//	for (int i = 0; i < SAMPLES; ++i)
	//		v3_add(&c, samples[i]);
	//	rgb[idx] = c;
	//	//v3_scale(&c, 1.0F / SAMPLES);
	//	//rgb[idx] = v3_get(sqrtf(c.r), sqrtf(c.g), sqrtf(c.b));
	//}
	// Say hello to atomics
	v3 c = pick.color;
	atomicAdd(&rgb[idx].r, c.x);
	atomicAdd(&rgb[idx].g, c.g);
	atomicAdd(&rgb[idx].b, c.b);
}

void render_frame(struct Sphere* spheres, size_t sphereCount)
{
	float scale = 1.0F / FRAMES;
	float cx = 8.0F;
	float cz = 10.0F;
	const float circle = (float)(M_PI * 2.0F);
	float angleDelta = circle * scale;
	float angle = 0.0F;

	struct Sphere* dSpheres;
	cudaMalloc(&dSpheres, sizeof(struct Sphere) * sphereCount);
	cudaMemcpy(dSpheres, spheres, sizeof(struct Sphere) * sphereCount, cudaMemcpyHostToDevice);

	struct Camera* dCamera;
	cudaMalloc(&dCamera, sizeof(struct Camera));

	struct v3* dCp;
	cudaMalloc(&dCp, PIXEL_SIZE);
	//cudaMallocManaged(&dCp, PIXEL_SIZE, cudaMemAttachGlobal);
	struct v3* cp;
	cudaMallocHost(&cp, PIXEL_SIZE);

	float* dRands;
	cudaMalloc(&dRands, RANDS_SIZE);

	curandGenerator_t gen;
	curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_MT19937);

	dim3 block;
	block.x = SAMPLES;
	block.y = PIXEL_DIM_SPLIT;
	block.z = 1;

	dim3 grid;
	grid.x = WIDTH / PIXEL_DIM_SPLIT;
	grid.y = HEIGHT;
	grid.z = 1;

	const size_t sharedMemSize = sizeof(v3) * SAMPLES;

	const float fov = 20.0F;
	const float aspect = (float)WIDTH / (float)HEIGHT;
	const v3 lookAt = v3_zero();
	const float aperture = 0.1F;
	const float focalDist = 10.0F;
	uint8_t* data = (uint8_t*)malloc(MEM_LENGTH);
	struct Camera* cam = Camera_new(v3_get(cx, 2.0F, cz), lookAt, v3_up(), fov, aspect, aperture, focalDist);

	cudaMemcpyToSymbol(cuda_Spheres, spheres, sphereCount * sizeof(struct Sphere), 0, cudaMemcpyHostToDevice);

	curandGenerateUniform(gen, dRands, RANDS_COUNT);
	for (int32_t f = 0; f < FRAMES; ++f)
	{
		cam->origin.x = cx * cosf(angle) - cz * sinf(angle);
		cam->origin.z = cz * cosf(angle) + cx * sinf(angle);
		Camera_set_view(cam, cam->origin, lookAt, v3_up(), fov, aspect, focalDist);
		angle += angleDelta;

		cudaMemcpy(dCamera, cam, sizeof(struct Camera), cudaMemcpyHostToDevice);
		//curandGenerateNormal(gen, dRands, RANDS_COUNT, 0.5F, 0.5F);
		//dummy<<<grid, block, sharedMemSize>>>(dCamera, dSpheres, sphereCount, dCp, dRands, WIDTH * HEIGHT);
		dummy<<<grid, block>>>(dCamera, dSpheres, sphereCount, dCp, dRands);
		cudaDeviceSynchronize();

		cudaMemcpy(cp, dCp, PIXEL_SIZE, cudaMemcpyDeviceToHost);

		size_t writeIdx = 0;
		for (int32_t i = 0; i < WIDTH * HEIGHT; ++i)
		{
			v3 c = cp[i];
			v3_scale(&c, 1.0F / SAMPLES);
			c = v3_get(sqrtf(c.r), sqrtf(c.g), sqrtf(c.b));

			data[writeIdx++] = fltoui8(c.r);
			data[writeIdx++] = fltoui8(c.g);
			data[writeIdx++] = fltoui8(c.b);
		}
		write_image(data, MEM_LENGTH, WIDTH, HEIGHT, f);
	}
	curandDestroyGenerator(gen);
	cudaFree(dSpheres);
	cudaFree(dCamera);
	cudaFree(dCp);
	cudaFree(dRands);
	cudaFreeHost(cp);
	Camera_free(cam);
}

int main(void)
{
	size_t sphereCount = 0;
	struct Sphere* spheres = get_spheres(SPHERE_COUNT, &sphereCount);
	render_frame(spheres, sphereCount);
	return 0;
}