Last active
October 16, 2023 15:53
-
-
Save BrentFarris/062e974457932c49f19e901805e632fd to your computer and use it in GitHub Desktop.
Cuda code for https://retroscience.net/ray-tracing-with-cuda.html
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
ffmpeg -r 30 -f image2 -c:v ppm -s 640x480 -start_number 0 -i frame-%%d.ppm -vcodec libx264 -crf 25 -pix_fmt yuv420p baked.mp4 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "cuda_runtime.h" | |
#include "curand_kernel.h" | |
#include "sm_20_atomic_functions.h" | |
#include "device_launch_parameters.h" | |
#define _CRT_SECURE_NO_DEPRECATE | |
#define _USE_MATH_DEFINES | |
#define _USE_MATH_DEFINES | |
#include <math.h> | |
#include <time.h> | |
#include <float.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <stdint.h> | |
#include <string.h> | |
#include <assert.h> | |
#include <stdbool.h> | |
//#define WIDTH 200 | |
//#define HEIGHT 100 | |
//#define WIDTH 256 | |
//#define HEIGHT 240 | |
//#define WIDTH 320 | |
//#define HEIGHT 240 | |
//#define WIDTH 640 | |
//#define HEIGHT 480 | |
#define WIDTH 720 | |
#define HEIGHT 480 | |
//#define WIDTH 858 | |
//#define HEIGHT 480 | |
//#define WIDTH 1920 | |
//#define HEIGHT 1080 | |
#define PIXEL_DIM_SPLIT 2 | |
#define PIXEL_SIZE sizeof(struct v3) * WIDTH * HEIGHT | |
#define SAMPLES 16 | |
#define MAX_DEPTH 50 | |
#define RAND_COUNT_THREAD 8 * MAX_DEPTH | |
#define RANDS_COUNT RAND_COUNT_THREAD * SAMPLES * PIXEL_DIM_SPLIT | |
#define RANDS_SIZE sizeof(float) * RANDS_COUNT | |
#define MEM_LENGTH WIDTH * HEIGHT * 3 | |
#define SPHERE_COUNT 175 | |
#define PPM_HEADER_FORMAT "P3\n%d %d\n255\n" | |
#define FRAMES 120 | |
#ifdef __cplusplus | |
extern "C" { | |
#endif | |
/************************************************************************/ | |
/************************************************************************/ | |
/* MTwister */ | |
/************************************************************************/ | |
/************************************************************************/ | |
// https://github.com/ESultanik/mtwister | |
/* An implementation of the MT19937 Algorithm for the Mersenne Twister | |
* by Evan Sultanik. Based upon the pseudocode in: M. Matsumoto and | |
* T. Nishimura, "Mersenne Twister: A 623-dimensionally | |
* equidistributed uniform pseudorandom number generator," ACM | |
* Transactions on Modeling and Computer Simulation Vol. 8, No. 1, | |
* January pp.3-30 1998. | |
* | |
* http://www.sultanik.com/Mersenne_twister | |
*/ | |
#define STATE_VECTOR_LENGTH 624 | |
#define STATE_VECTOR_SIZE sizeof(uint64_t) * STATE_VECTOR_LENGTH | |
#define STATE_VECTOR_M 397 /* changes to STATE_VECTOR_LENGTH also require changes to this */ | |
#define UPPER_MASK 0x80000000 | |
#define LOWER_MASK 0x7fffffff | |
#define TEMPERING_MASK_B 0x9d2c5680 | |
#define TEMPERING_MASK_C 0xefc60000 | |
typedef struct tagMTRand { | |
uint64_t* mt; | |
int32_t index; | |
} MTRand; | |
__device__ __host__ static void m_seedRand(MTRand* rnd, uint64_t seed) | |
{ | |
/* set initial seeds to mt[STATE_VECTOR_LENGTH] using the generator | |
* from Line 25 of Table 1 in: Donald Knuth, "The Art of Computer | |
* Programming," Vol. 2 (2nd Ed.) pp.102. | |
*/ | |
rnd->mt = (uint64_t*)malloc(STATE_VECTOR_SIZE); | |
rnd->mt[0] = seed & 0xffffffff; | |
for (rnd->index = 1; rnd->index < STATE_VECTOR_LENGTH; rnd->index++) | |
rnd->mt[rnd->index] = (6069 * rnd->mt[rnd->index - 1]) & 0xffffffff; | |
} | |
/** | |
* Creates a new random number generator from a given seed. | |
*/ | |
__device__ __host__ MTRand seed_rand(uint64_t seed) | |
{ | |
MTRand rnd; | |
m_seedRand(&rnd, seed); | |
return rnd; | |
} | |
__device__ __host__ void mt_rand_free(MTRand* rnd) | |
{ | |
free(rnd->mt); | |
} | |
MTRand make_rand_seed() | |
{ | |
time_t now; | |
time(&now); | |
return seed_rand(now); | |
} | |
/** | |
* Generates a pseudo-randomly generated long. | |
*/ | |
__device__ __host__ uint64_t gen_rand_long(MTRand* rnd) | |
{ | |
uint64_t y; | |
static uint64_t mag[2] = { 0x0, 0x9908b0df }; /* mag[x] = x * 0x9908b0df for x = 0,1 */ | |
if (rnd->index >= STATE_VECTOR_LENGTH || rnd->index < 0) | |
{ | |
/* generate STATE_VECTOR_LENGTH words at a time */ | |
int32_t kk; | |
if (rnd->index >= STATE_VECTOR_LENGTH + 1 || rnd->index < 0) | |
{ | |
m_seedRand(rnd, 4357); | |
} | |
for (kk = 0; kk < STATE_VECTOR_LENGTH - STATE_VECTOR_M; kk++) | |
{ | |
y = (rnd->mt[kk] & UPPER_MASK) | (rnd->mt[kk + 1] & LOWER_MASK); | |
rnd->mt[kk] = rnd->mt[kk + STATE_VECTOR_M] ^ (y >> 1) ^ mag[y & 0x1]; | |
} | |
for (; kk < STATE_VECTOR_LENGTH - 1; kk++) | |
{ | |
y = (rnd->mt[kk] & UPPER_MASK) | (rnd->mt[kk + 1] & LOWER_MASK); | |
rnd->mt[kk] = rnd->mt[kk + (STATE_VECTOR_M - STATE_VECTOR_LENGTH)] ^ (y >> 1) ^ mag[y & 0x1]; | |
} | |
y = (rnd->mt[STATE_VECTOR_LENGTH - 1] & UPPER_MASK) | (rnd->mt[0] & LOWER_MASK); | |
rnd->mt[STATE_VECTOR_LENGTH - 1] = rnd->mt[STATE_VECTOR_M - 1] ^ (y >> 1) ^ mag[y & 0x1]; | |
rnd->index = 0; | |
} | |
y = rnd->mt[rnd->index++]; | |
y ^= (y >> 11); | |
y ^= (y << 7) & TEMPERING_MASK_B; | |
y ^= (y << 15) & TEMPERING_MASK_C; | |
y ^= (y >> 18); | |
return y; | |
} | |
/** | |
* Generates a pseudo-randomly generated double in the range [0..1]. | |
*/ | |
__device__ __host__ double gen_rand(MTRand* rnd) | |
{ | |
double val = 0.0 + gen_rand_long(rnd); | |
return(val / (double)0xffffffff); | |
} | |
__device__ __host__ int32_t rand_int32(MTRand* rnd, int32_t min, int32_t max) | |
{ | |
uint64_t r = gen_rand_long(rnd); | |
int32_t q = r % INT32_MAX; | |
if (min < 0) | |
{ | |
if (r <= UINT64_MAX / 2) | |
q *= -1; | |
} | |
return q % (max - min) + min; | |
} | |
__device__ __host__ float rand_float(MTRand* rnd, float min, float max) | |
{ | |
return (float)gen_rand(rnd) * (max - min) + min; | |
} | |
/************************************************************************/ | |
/************************************************************************/ | |
/* V3 */ | |
/************************************************************************/ | |
/************************************************************************/ | |
typedef struct v3 { | |
union { | |
float x; | |
float r; | |
}; | |
union { | |
float y; | |
float g; | |
}; | |
union { | |
float z; | |
float b; | |
}; | |
} v3; | |
__device__ __host__ static v3 v3_zero() | |
{ | |
v3 v; | |
v.x = 0.0F; | |
v.y = 0.0F; | |
v.z = 0.0F; | |
return v; | |
} | |
__device__ __host__ static v3 v3_one() | |
{ | |
v3 v; | |
v.x = 1.0F; | |
v.y = 1.0F; | |
v.z = 1.0F; | |
return v; | |
} | |
__device__ __host__ static v3 v3_up() | |
{ | |
v3 v; | |
v.x = 0.0F; | |
v.y = 1.0F; | |
v.z = 0.0F; | |
return v; | |
} | |
__device__ __host__ static v3 v3_get(const float x, const float y, const float z) | |
{ | |
v3 v; | |
v.x = x; | |
v.y = y; | |
v.z = z; | |
return v; | |
} | |
__device__ __host__ void v3_load(const v3 from, v3* to) | |
{ | |
to->x = from.x; | |
to->y = from.y; | |
to->z = from.z; | |
} | |
__device__ __host__ void v3_copy(const v3 from, v3* to) | |
{ | |
to->x = from.x; | |
to->y = from.y; | |
to->z = from.z; | |
} | |
__device__ __host__ void v3_add(v3* lhs, const v3 rhs) | |
{ | |
lhs->x += rhs.x; | |
lhs->y += rhs.y; | |
lhs->z += rhs.z; | |
} | |
__device__ __host__ void v3_subtract(v3* lhs, const v3 rhs) | |
{ | |
lhs->x -= rhs.x; | |
lhs->y -= rhs.y; | |
lhs->z -= rhs.z; | |
} | |
__device__ __host__ void v3_multiply(v3* lhs, const v3 rhs) | |
{ | |
lhs->x *= rhs.x; | |
lhs->y *= rhs.y; | |
lhs->z *= rhs.z; | |
} | |
__device__ __host__ void v3_divide(v3* lhs, const v3 rhs) | |
{ | |
lhs->x /= rhs.x; | |
lhs->y /= rhs.y; | |
lhs->z /= rhs.z; | |
} | |
__device__ __host__ void v3_add_using(v3* lhs, const float x, const float y, const float z) | |
{ | |
lhs->x += x; | |
lhs->y += y; | |
lhs->z += z; | |
} | |
__device__ __host__ void v3_subtract_using(v3* lhs, const float x, const float y, const float z) | |
{ | |
lhs->x -= x; | |
lhs->y -= y; | |
lhs->z -= z; | |
} | |
__device__ __host__ v3 v3_add_to(const v3 a, const v3 b) | |
{ | |
v3 v; | |
v.x = a.x + b.x; | |
v.y = a.y + b.y; | |
v.z = a.z + b.z; | |
return v; | |
} | |
__device__ __host__ v3 v3_subtract_to(const v3 a, const v3 b) | |
{ | |
v3 v; | |
v.x = a.x - b.x; | |
v.y = a.y - b.y; | |
v.z = a.z - b.z; | |
return v; | |
} | |
__device__ __host__ void v3_scale(v3* from, const float scale) | |
{ | |
from->x *= scale; | |
from->y *= scale; | |
from->z *= scale; | |
} | |
__device__ __host__ v3 v3_scale_from(const v3 vector, const float scale) | |
{ | |
v3 v; | |
v.x = vector.x * scale; | |
v.y = vector.y * scale; | |
v.z = vector.z * scale; | |
return v; | |
} | |
__device__ __host__ v3 v3_scale_to(const v3 vector, const float scale) | |
{ | |
v3 v; | |
v.x = vector.x * scale; | |
v.y = vector.y * scale; | |
v.z = vector.z * scale; | |
return v; | |
} | |
__device__ __host__ float v3_magnitude(const v3 vector) | |
{ | |
return sqrtf((vector.x * vector.x) + (vector.y * vector.y) + (vector.z * vector.z)); | |
} | |
__device__ __host__ float v3_distance(const v3 a, const v3 b) | |
{ | |
return v3_magnitude(v3_subtract_to(a, b)); | |
} | |
__device__ __host__ void v3_normalize(v3* vector) | |
{ | |
float mag = v3_magnitude(*vector); | |
vector->x /= mag; | |
vector->y /= mag; | |
vector->z /= mag; | |
} | |
__device__ __host__ v3 v3_normalized(const v3 vector) | |
{ | |
float mag = v3_magnitude(vector); | |
if (mag != 0) | |
{ | |
v3 copy = vector; | |
copy.x /= mag; | |
copy.y /= mag; | |
copy.z /= mag; | |
return copy; | |
} | |
return v3_zero(); | |
} | |
__device__ __host__ float v3_dot(const v3 a, const v3 b) | |
{ | |
return (a.x * b.x) + (a.y * b.y) + (a.z * b.z); | |
} | |
__device__ __host__ float v3_dot_from(const v3 a, const v3 b) | |
{ | |
return (a.x * b.x) + (a.y * b.y) + (a.z * b.z); | |
} | |
__device__ __host__ v3 v3_cross(const v3 a, const v3 b) | |
{ | |
v3 cross; | |
cross.x = (a.y * b.z) - (a.z * b.y); | |
cross.y = (a.z * b.x) - (a.x * b.z); | |
cross.z = (a.x * b.y) - (a.y * b.x); | |
return cross; | |
} | |
__device__ __host__ v3 v3_unit(const v3 vec) | |
{ | |
v3 out = vec; | |
float len = 1.0F / v3_magnitude(vec); | |
v3_scale(&out, len); | |
return out; | |
} | |
/************************************************************************/ | |
/************************************************************************/ | |
/* Ray */ | |
/************************************************************************/ | |
/************************************************************************/ | |
typedef struct Ray { | |
v3 ori; | |
v3 dir; | |
} Ray; | |
__device__ __host__ static Ray ray_get(const v3 origin, const v3 direction) | |
{ | |
Ray r; | |
r.ori = origin; | |
r.dir = direction; | |
return r; | |
} | |
__device__ __host__ static v3 ray_origin(const Ray* ray) | |
{ | |
return ray->ori; | |
} | |
__device__ __host__ static v3 ray_direction(const Ray* ray) | |
{ | |
return ray->dir; | |
} | |
__device__ __host__ v3 ray_point(const Ray* ray, const float len) | |
{ | |
/*--- origin + length * direction -------------------------------------*/ | |
v3 p = ray->dir; | |
v3_scale(&p, len); | |
v3_add(&p, ray->ori); | |
return p; | |
} | |
/************************************************************************/ | |
/************************************************************************/ | |
/* Communication structures */ | |
/************************************************************************/ | |
/************************************************************************/ | |
struct ColorPick { | |
v3 color; | |
Ray scatter; | |
uint16_t depth; | |
bool done; | |
}; | |
struct DeviceRands { | |
const float* nums; | |
size_t idx; | |
}; | |
/************************************************************************/ | |
/************************************************************************/ | |
/* Camera */ | |
/************************************************************************/ | |
/************************************************************************/ | |
struct Camera { | |
v3 origin; | |
v3 lowerLeft; | |
v3 horizontal; | |
v3 vertical; | |
v3 u; | |
v3 v; | |
v3 w; | |
float lensRadius; | |
}; | |
void Camera_set_view(struct Camera* cam, const v3 origin, const v3 lookAt, const v3 up, | |
const float fov, const float aspect, const float focalDistance) | |
{ | |
const float theta = (float)(fov * M_PI / 180.0F); | |
const float halfHeight = tanf(theta / 2.0F); | |
const float halfWidth = aspect * halfHeight; | |
cam->origin = origin; | |
cam->w = v3_unit(v3_subtract_to(origin, lookAt)); | |
cam->u = v3_unit(v3_cross(up, cam->w)); | |
cam->v = v3_cross(cam->w, cam->u); | |
//cam->lowerLeft = v3_get(-halfWidth, -halfHeight, -1.0F); | |
cam->lowerLeft = cam->origin; | |
v3_subtract(&cam->lowerLeft, v3_scale_to(cam->u, halfWidth * focalDistance)); | |
v3_subtract(&cam->lowerLeft, v3_scale_to(cam->v, halfHeight * focalDistance)); | |
v3_subtract(&cam->lowerLeft, v3_scale_to(cam->w, focalDistance)); | |
cam->horizontal = cam->u; | |
v3_scale(&cam->horizontal, halfWidth * 2.0F * focalDistance); | |
cam->vertical = cam->v; | |
v3_scale(&cam->vertical, halfHeight * 2.0F * focalDistance); | |
} | |
struct Camera* Camera_new(const v3 origin, const v3 lookAt, const v3 up, const float fov, const float aspect, | |
const float aperture, const float focalDistance) | |
{ | |
struct Camera* cam = (struct Camera*)calloc(1, sizeof(struct Camera)); | |
cam->lensRadius = aperture / 2.0F; | |
Camera_set_view(cam, origin, lookAt, up, fov, aspect, focalDistance); | |
return cam; | |
} | |
void Camera_free(struct Camera* cam) | |
{ | |
free(cam); | |
} | |
__device__ static v3 local_random_unit_disk(struct DeviceRands* rng) | |
{ | |
v3 p = v3_get(rng->nums[rng->idx++], rng->nums[rng->idx++], 0.0F); | |
v3_scale(&p, 2.0F); | |
v3_subtract(&p, v3_get(1.0F, 1.0F, 0.0F)); | |
if (v3_dot(p, p) >= 1.0F) | |
v3_scale(&p, 0.5F); | |
return p; | |
} | |
__device__ Ray Camera_ray(struct Camera* cam, const float u, const float v, struct DeviceRands* rng) | |
{ | |
v3 r = v3_scale_to(local_random_unit_disk(rng), cam->lensRadius); | |
v3 offset = v3_scale_to(cam->u, r.x); | |
v3_add(&offset, v3_scale_to(cam->v, r.y)); | |
v3 dir = cam->lowerLeft; | |
v3_add(&dir, v3_scale_to(cam->horizontal, u)); | |
v3_add(&dir, v3_scale_to(cam->vertical, v)); | |
v3 oo = v3_add_to(cam->origin, offset); | |
v3_subtract(&dir, oo); | |
return ray_get(oo, dir); | |
} | |
/************************************************************************/ | |
/************************************************************************/ | |
/* Hit */ | |
/************************************************************************/ | |
/************************************************************************/ | |
struct Material; | |
typedef struct Hit { | |
v3 p; // Point | |
v3 nml; // Normal | |
float t; // Time (for like motion blur or w/e) | |
const struct Material* mat; | |
} Hit; | |
/************************************************************************/ | |
/************************************************************************/ | |
/* Material */ | |
/************************************************************************/ | |
/************************************************************************/ | |
struct Material { | |
v3 albedo; | |
float fuzz; | |
float refractIdx; | |
int32_t scatterType; | |
}; | |
__device__ static v3 rnd_unit_sphere_point(struct DeviceRands* rng) | |
{ | |
v3 p = v3_get(rng->nums[rng->idx++], rng->nums[rng->idx++], rng->nums[rng->idx++]); | |
if (v3_magnitude(p) > 0.5F) | |
v3_scale(&p, 0.5F); | |
return p; | |
} | |
__device__ static float schlick(const float cosine, const float refractIdx) | |
{ | |
// https://en.wikipedia.org/wiki/Schlick%27s_approximation | |
float r0 = (1 - refractIdx) / (1 + refractIdx); | |
r0 = r0 * r0; | |
return r0 + (1.0F - r0) * powf((1.0F - cosine), 5.0f); | |
} | |
__device__ static v3 reflect(const v3 v, const v3 n) | |
{ | |
const float dot = v3_dot(v, n); | |
v3 r = v3_scale_to(n, dot * 2.0F); | |
return v3_subtract_to(v, r); | |
} | |
__device__ static bool refract(const v3 v, const v3 n, const float niOverNt, v3* outRefracted) | |
{ | |
v3 uv = v3_unit(v); | |
const float dt = v3_dot(uv, n); | |
const float discriminant = 1.0F - niOverNt * niOverNt * (1.0F - dt * dt); | |
if (discriminant > 0.0F) | |
{ | |
/*--- niOverNt * (uv - n * dt) -----------------------------------------*/ | |
v3 l = v3_scale_to(v3_subtract_to(uv, v3_scale_to(n, dt)), niOverNt); | |
/*--- n * sqrt(discriminant) -------------------------------------------*/ | |
v3 r = v3_scale_to(n, sqrtf(discriminant)); | |
*outRefracted = v3_subtract_to(l, r); | |
return true; | |
} | |
return false; | |
} | |
__device__ bool mat_scatter_lambert(const Hit* hit, v3* outAttenuation, Ray* outRay, struct DeviceRands* rng) | |
{ | |
// TODO: Possibly can scatter with some probability and then attenuation could be albedo/probability | |
v3 target = v3_add_to(hit->p, hit->nml); | |
v3_add(&target, rnd_unit_sphere_point(rng)); | |
*outRay = ray_get(hit->p, v3_subtract_to(target, hit->p)); | |
*outAttenuation = hit->mat->albedo; | |
return true; | |
} | |
__device__ bool mat_scatter_metal(const Hit* hit, const Ray* ray, v3* outAttenuation, Ray* outRay, struct DeviceRands* rng) | |
{ | |
v3 reflected = reflect(v3_unit(ray_direction(ray)), hit->nml); | |
v3 dir = v3_scale_to(rnd_unit_sphere_point(rng), hit->mat->fuzz); | |
v3_add(&dir, reflected); | |
*outRay = ray_get(hit->p, dir); | |
*outAttenuation = hit->mat->albedo; | |
return v3_dot(ray_direction(outRay), hit->nml) > 0.0F; | |
} | |
__device__ bool mat_scatter_dielectric(const Hit* hit, const Ray* ray, v3* outAttenuation, Ray* outRay, struct DeviceRands* rng) | |
{ | |
v3 oNorm; | |
float niOverNt; | |
float reflectProb; | |
float cosine; | |
/*--- This const value because glass doesn't absorb --------------------*/ | |
*outAttenuation = v3_get(1.0F, 1.0F, 1.0F); | |
if (v3_dot(ray_direction(ray), hit->nml) > 0.0F) | |
{ | |
oNorm = v3_scale_to(hit->nml, -1.0F); | |
niOverNt = hit->mat->refractIdx; | |
//cosine = mat->refractIdx * v3_dot(ray_direction(ray), hit->nml) / v3_magnitude(ray_direction(ray)); | |
cosine = v3_dot(ray_direction(ray), hit->nml) / v3_magnitude(ray_direction(ray)); | |
cosine = sqrtf(1.0F - hit->mat->refractIdx * hit->mat->refractIdx * (1.0F - cosine * cosine)); | |
} | |
else | |
{ | |
oNorm = hit->nml; | |
niOverNt = 1.0F / hit->mat->refractIdx; | |
cosine = -v3_dot(ray_direction(ray), hit->nml) / v3_magnitude(ray_direction(ray)); | |
} | |
v3 refracted; | |
v3 reflected = reflect(ray_direction(ray), hit->nml); | |
if (refract(ray_direction(ray), oNorm, niOverNt, &refracted)) | |
reflectProb = schlick(cosine, hit->mat->refractIdx); | |
else | |
reflectProb = 1.0F; | |
if (rng->nums[rng->idx++] < reflectProb) | |
*outRay = ray_get(hit->p, reflected); | |
else | |
*outRay = ray_get(hit->p, refracted); | |
return true; | |
} | |
struct Material mat_lambert(const v3 albedo) | |
{ | |
struct Material m; | |
m.albedo = albedo; | |
m.scatterType = 1; | |
return m; | |
} | |
struct Material mat_metal(const v3 albedo, const float fuzz) | |
{ | |
struct Material m; | |
m.albedo = albedo; | |
m.fuzz = fuzz; | |
m.scatterType = 2; | |
return m; | |
} | |
struct Material mat_glass(const float refractIndex) | |
{ | |
struct Material m; | |
m.refractIdx = refractIndex; | |
m.scatterType = 3; | |
return m; | |
} | |
/************************************************************************/ | |
/************************************************************************/ | |
/* Sphere */ | |
/************************************************************************/ | |
/************************************************************************/ | |
struct Sphere { | |
v3 center; | |
float radius; | |
struct Material mat; | |
}; | |
struct Sphere sphere_get(const v3 center, const float radius, const struct Material mat) | |
{ | |
struct Sphere s; | |
s.center = center; | |
s.radius = radius; | |
s.mat = mat; | |
return s; | |
} | |
__device__ bool sphere_hit(const struct Sphere* sphere, const Ray* ray, const float tMin, float tMax, Hit* outHit) | |
{ | |
v3 oc = v3_subtract_to(ray_origin(ray), sphere->center); | |
float a = v3_dot(ray_direction(ray), ray_direction(ray)); | |
float b = v3_dot(oc, ray_direction(ray)); | |
float c = v3_dot(oc, oc) - sphere->radius * sphere->radius; | |
float discriminant = b * b - a * c; | |
if (discriminant > 0.0F) | |
{ | |
float t = 0.0F; | |
float t0 = (-b - sqrtf(discriminant)) / a; | |
float t1 = (-b + sqrtf(discriminant)) / a; | |
if (t0 < tMax && t0 > tMin) | |
t = t0; | |
else if (t1 < tMax && t1 > tMin) | |
t = t1; | |
else | |
return false; | |
outHit->t = t; | |
outHit->p = ray_point(ray, t); | |
outHit->nml = outHit->p; | |
v3_subtract(&outHit->nml, sphere->center); | |
v3_scale(&outHit->nml, 1.0F / sphere->radius); | |
outHit->mat = &sphere->mat; | |
return true; | |
} | |
return false; | |
} | |
#ifdef __cplusplus | |
} | |
#endif | |
/************************************************************************/ | |
/************************************************************************/ | |
/* Global */ | |
/************************************************************************/ | |
/************************************************************************/ | |
__device__ static void color(Ray* ray, struct Sphere* spheres, const size_t sphereCount, struct DeviceRands* rng, struct ColorPick* pick) | |
{ | |
Hit hit; | |
hit.t = FLT_MAX; | |
for (size_t i = 0; i < sphereCount; ++i) | |
{ | |
Hit check; | |
check.t = FLT_MAX; | |
sphere_hit(spheres + i, ray, 0.001F, FLT_MAX, &check); | |
/*--- Near 0 should be ignored, thus 0.001F ----------------------------*/ | |
if (check.t < hit.t) | |
hit = check; | |
} | |
if (hit.t < FLT_MAX) | |
{ | |
Ray scattered; | |
v3 attenuation; | |
bool valid = false; | |
switch (hit.mat->scatterType) | |
{ | |
case 1: | |
valid = mat_scatter_lambert(&hit, &attenuation, &scattered, rng); | |
break; | |
case 2: | |
valid = mat_scatter_metal(&hit, ray, &attenuation, &scattered, rng); | |
break; | |
case 3: | |
valid = mat_scatter_dielectric(&hit, ray, &attenuation, &scattered, rng); | |
break; | |
} | |
if (valid) | |
{ | |
pick->scatter = scattered; | |
v3_multiply(&pick->color, attenuation); | |
return; | |
} | |
else | |
{ | |
pick->color = v3_zero(); | |
pick->done = true; | |
return; | |
} | |
} | |
// 1 = blue & 0 = white | |
v3 unit = v3_unit(ray_direction(ray)); // -1 < y < 1 | |
float t = 0.5F * (unit.y + 1.0F); // Scale above to 0 < y < 1 | |
/*--- ((1.0 - t) * <1.0,1.0,1.0>) + (t * <0.5,0.7,0.1>) ----------------*/ | |
v3 start = v3_one(); | |
v3 end = v3_get(0.5F, 0.7F, 1.0F); | |
v3_scale(&end, t); | |
v3_scale(&start, 1.0F - t); | |
v3_add(&start, end); | |
v3_multiply(&pick->color, start); | |
pick->done = true; | |
} | |
static void write_image(const uint8_t* buff, const size_t len, const int32_t width, const int32_t height, const int32_t frameIdx) | |
{ | |
// TODO: The buf should be from float[] to uint8_t[] | |
assert(len > 0); | |
assert(width > 0); | |
assert(height > 0); | |
assert(len % 3 == 0); | |
char name[128]; | |
snprintf(name, 128, "frame-%d.ppm", frameIdx); | |
FILE* fp = fopen(name, "w"); | |
if (fp == NULL) | |
return; | |
char header[sizeof(PPM_HEADER_FORMAT) + 20]; | |
snprintf(header, sizeof(header), PPM_HEADER_FORMAT, width, height); | |
fwrite(header, strlen(header), 1, fp); | |
char rgb[4]; | |
const size_t colCount = (size_t)width * 3; | |
for (size_t i = 0; i < len;) | |
{ | |
for (size_t c = 0; c < colCount; ++c, ++i) | |
{ | |
_itoa(buff[i], rgb, 10); | |
fwrite(rgb, strlen(rgb), 1, fp); | |
fwrite(" ", 1, 1, fp); | |
} | |
fwrite("\n", 1, 1, fp); | |
} | |
fclose(fp); | |
} | |
static uint8_t fltoui8(const float val) | |
{ | |
return (uint8_t)(val * 255.99F); | |
} | |
static struct Sphere* get_spheres(const size_t count, size_t* outCount) | |
{ | |
MTRand r = make_rand_seed(); | |
MTRand* rnd = &r; | |
struct Sphere* spheres = (struct Sphere*)malloc(sizeof(struct Sphere) * (count + 1)); | |
size_t i = 0; | |
spheres[i++] = sphere_get(v3_get(0.0F, -1000.0F, 0.0F), 1000.0F, mat_lambert(v3_get(0.5F, 0.5F, 0.5F))); | |
const v3 m = v3_get(4.0F, 0.2F, 0.0F); | |
for (int32_t a = -5; a < 5; ++a) | |
{ | |
for (int32_t b = -5; b < 5; ++b) | |
{ | |
v3 center = v3_get(a + 0.9F * rand_float(rnd, 0.0F, 1.0F), 0.2F, b + 0.9F * rand_float(rnd, 0.0F, 1.0F)); | |
if (v3_magnitude(v3_subtract_to(center, m)) > 0.9F) | |
{ | |
float matChoice = rand_float(rnd, 0.0F, 1.0F); | |
if (matChoice < 0.8F) | |
spheres[i++] = sphere_get(center, 0.2F, mat_lambert(v3_get(rand_float(rnd, 0.0F, 1.0F), rand_float(rnd, 0.0F, 1.0F), rand_float(rnd, 0.0F, 1.0F)))); | |
else if (matChoice < 0.95F) | |
spheres[i++] = sphere_get(center, 0.2F, mat_metal(v3_get((1 + rand_float(rnd, 0.0F, 1.0F)) * 0.5F, (1 + rand_float(rnd, 0.0F, 1.0F)) * 0.5F, (1 + rand_float(rnd, 0.0F, 1.0F)) * 0.5F), rand_float(rnd, 0.0F, 1.0F) * 0.5F)); | |
else | |
spheres[i++] = sphere_get(center, 0.2F, mat_glass(1.5F)); | |
} | |
if (i == count - 3) | |
break; | |
} | |
if (i == count - 3) | |
break; | |
} | |
spheres[i++] = sphere_get(v3_get(0.0F, 1.0F, 0.0F), 1.0F, mat_glass(1.5F)); | |
spheres[i++] = sphere_get(v3_get(-4.0F, 1.0F, 0.0F), 1.0F, mat_lambert(v3_get(0.4F, 0.2F, 0.1F))); | |
spheres[i++] = sphere_get(v3_get(4.0F, 1.0F, 0.0F), 1.0F, mat_metal(v3_get(0.7F, 0.6F, 0.5F), 0.0F)); | |
*outCount = i; | |
mt_rand_free(rnd); | |
return spheres; | |
} | |
__constant__ struct Sphere cuda_Spheres[500]; | |
__global__ void dummy(struct Camera* cam, struct Sphere* spheres, size_t sphereCount, struct v3* rgb, const float* dRands) | |
{ | |
//extern __shared__ v3 samples[]; | |
int threadsPerBlock = blockDim.x * blockDim.y * blockDim.z; | |
int blocksPerGrid = gridDim.x * gridDim.y * gridDim.z; | |
int threadPositionInBlock = threadIdx.x + | |
blockDim.x * threadIdx.y + | |
blockDim.x * blockDim.y * threadIdx.z; | |
int blockPosInGrid = blockIdx.x + | |
gridDim.x * blockIdx.y + | |
gridDim.x * gridDim.y * blockIdx.z; | |
//int idx = threadPositionInBlock * blocksPerGrid + blockPosInGrid; | |
//int idx = blockPosInGrid * threadsPerBlock + threadPositionInBlock; | |
//int idx = blockIdx.y * gridDim.x + blockIdx.x; | |
int h = (blockIdx.x * blockDim.y) + threadIdx.y; | |
int idx = (blockIdx.y * gridDim.x * blockDim.y) + h; | |
rgb[idx] = v3_zero(); | |
struct ColorPick pick; | |
struct DeviceRands rng; | |
rng.idx = 0; | |
//int rndOffset = threadPositionInBlock * RAND_COUNT_THREAD; | |
int rndOffset = threadIdx.x * RAND_COUNT_THREAD; | |
//int rndOffset = (idx + threadIdx.x * RAND_COUNT_THREAD) % (RANDS_COUNT - RAND_COUNT_THREAD); | |
rng.nums = dRands + rndOffset; | |
const float u = ((float)h + rng.nums[rng.idx++]) / WIDTH; | |
//const float u = ((float)blockIdx.x + rng.nums[rng.idx++]) / WIDTH; | |
const float v = ((float)HEIGHT - blockIdx.y + rng.nums[rng.idx++]) / HEIGHT; | |
pick.scatter = Camera_ray(cam, u, v, &rng); | |
pick.done = false; | |
pick.depth = 0; | |
pick.color = v3_one(); | |
while (!pick.done && pick.depth < MAX_DEPTH) | |
{ | |
color(&pick.scatter, cuda_Spheres, sphereCount, &rng, &pick); | |
pick.depth++; | |
} | |
//samples[threadPositionInBlock] = pick.color; | |
// Block for all threads and sum up shared values | |
//__syncthreads(); | |
//if (threadPositionInBlock == 0) | |
//{ | |
// v3 c = v3_zero(); | |
// for (int i = 0; i < SAMPLES; ++i) | |
// v3_add(&c, samples[i]); | |
// rgb[idx] = c; | |
// //v3_scale(&c, 1.0F / SAMPLES); | |
// //rgb[idx] = v3_get(sqrtf(c.r), sqrtf(c.g), sqrtf(c.b)); | |
//} | |
// Say hello to atomics | |
v3 c = pick.color; | |
atomicAdd(&rgb[idx].r, c.x); | |
atomicAdd(&rgb[idx].g, c.g); | |
atomicAdd(&rgb[idx].b, c.b); | |
} | |
void render_frame(struct Sphere* spheres, size_t sphereCount) | |
{ | |
float scale = 1.0F / FRAMES; | |
float cx = 8.0F; | |
float cz = 10.0F; | |
const float circle = (float)(M_PI * 2.0F); | |
float angleDelta = circle * scale; | |
float angle = 0.0F; | |
struct Sphere* dSpheres; | |
cudaMalloc(&dSpheres, sizeof(struct Sphere) * sphereCount); | |
cudaMemcpy(dSpheres, spheres, sizeof(struct Sphere) * sphereCount, cudaMemcpyHostToDevice); | |
struct Camera* dCamera; | |
cudaMalloc(&dCamera, sizeof(struct Camera)); | |
struct v3* dCp; | |
cudaMalloc(&dCp, PIXEL_SIZE); | |
//cudaMallocManaged(&dCp, PIXEL_SIZE, cudaMemAttachGlobal); | |
struct v3* cp; | |
cudaMallocHost(&cp, PIXEL_SIZE); | |
float* dRands; | |
cudaMalloc(&dRands, RANDS_SIZE); | |
curandGenerator_t gen; | |
curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_MT19937); | |
dim3 block; | |
block.x = SAMPLES; | |
block.y = PIXEL_DIM_SPLIT; | |
block.z = 1; | |
dim3 grid; | |
grid.x = WIDTH / PIXEL_DIM_SPLIT; | |
grid.y = HEIGHT; | |
grid.z = 1; | |
const size_t sharedMemSize = sizeof(v3) * SAMPLES; | |
const float fov = 20.0F; | |
const float aspect = (float)WIDTH / (float)HEIGHT; | |
const v3 lookAt = v3_zero(); | |
const float aperture = 0.1F; | |
const float focalDist = 10.0F; | |
uint8_t* data = (uint8_t*)malloc(MEM_LENGTH); | |
struct Camera* cam = Camera_new(v3_get(cx, 2.0F, cz), lookAt, v3_up(), fov, aspect, aperture, focalDist); | |
cudaMemcpyToSymbol(cuda_Spheres, spheres, sphereCount * sizeof(struct Sphere), 0, cudaMemcpyHostToDevice); | |
curandGenerateUniform(gen, dRands, RANDS_COUNT); | |
for (int32_t f = 0; f < FRAMES; ++f) | |
{ | |
cam->origin.x = cx * cosf(angle) - cz * sinf(angle); | |
cam->origin.z = cz * cosf(angle) + cx * sinf(angle); | |
Camera_set_view(cam, cam->origin, lookAt, v3_up(), fov, aspect, focalDist); | |
angle += angleDelta; | |
cudaMemcpy(dCamera, cam, sizeof(struct Camera), cudaMemcpyHostToDevice); | |
//curandGenerateNormal(gen, dRands, RANDS_COUNT, 0.5F, 0.5F); | |
//dummy<<<grid, block, sharedMemSize>>>(dCamera, dSpheres, sphereCount, dCp, dRands, WIDTH * HEIGHT); | |
dummy<<<grid, block>>>(dCamera, dSpheres, sphereCount, dCp, dRands); | |
cudaDeviceSynchronize(); | |
cudaMemcpy(cp, dCp, PIXEL_SIZE, cudaMemcpyDeviceToHost); | |
size_t writeIdx = 0; | |
for (int32_t i = 0; i < WIDTH * HEIGHT; ++i) | |
{ | |
v3 c = cp[i]; | |
v3_scale(&c, 1.0F / SAMPLES); | |
c = v3_get(sqrtf(c.r), sqrtf(c.g), sqrtf(c.b)); | |
data[writeIdx++] = fltoui8(c.r); | |
data[writeIdx++] = fltoui8(c.g); | |
data[writeIdx++] = fltoui8(c.b); | |
} | |
write_image(data, MEM_LENGTH, WIDTH, HEIGHT, f); | |
} | |
curandDestroyGenerator(gen); | |
cudaFree(dSpheres); | |
cudaFree(dCamera); | |
cudaFree(dCp); | |
cudaFree(dRands); | |
cudaFreeHost(cp); | |
Camera_free(cam); | |
} | |
int main(void) | |
{ | |
size_t sphereCount = 0; | |
struct Sphere* spheres = get_spheres(SPHERE_COUNT, &sphereCount); | |
render_frame(spheres, sphereCount); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment