Created
April 20, 2012 08:47
-
-
Save iamgreaser/2427184 to your computer and use it in GitHub Desktop.
AVX 2d grid caster
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
incomplete AVX-based SDL 2d-grid raycaster engine | |
by Ben "GreaseMonkey" Russell, 2012. public domain. | |
*/ | |
// note to intel: who the hell named these?!?! (also why isn't AVX "ymmintrin") | |
#include <mmintrin.h> /* MMX */ | |
#include <xmmintrin.h> /* SSE */ | |
#include <emmintrin.h> /* SSE2 */ | |
#include <pmmintrin.h> /* SSE3 */ | |
#include <tmmintrin.h> /* SSSE3 */ | |
#include <smmintrin.h> /* SSE4.1 */ | |
#include <nmmintrin.h> /* SSE4.2 */ | |
#include <immintrin.h> /* AVX */ | |
#include <unistd.h> | |
#include <string.h> | |
#include <stdlib.h> | |
#include <stdint.h> | |
#include <stdio.h> | |
#include <errno.h> | |
#include <math.h> | |
#include <SDL.h> | |
struct camera | |
{ | |
float px, py; | |
float vx, vy; | |
}; | |
static struct camera basecam = {7.2, 5.1, 0.0, 1.0}; | |
static SDL_Surface *surf = NULL; | |
static uint8_t map[16][16] = { | |
{1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, | |
{1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1}, | |
{1,0,1,0,1,0,1,1,1,0,1,0,1,0,1,1}, | |
{1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1}, | |
{1,0,0,0,1,1,1,0,1,1,1,0,1,0,1,1}, | |
{1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1}, | |
{1,1,1,0,1,1,1,0,1,0,1,0,1,0,1,1}, | |
{1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1}, | |
{1,0,1,0,1,0,0,1,1,0,1,0,1,0,1,1}, | |
{1,0,0,0,0,0,1,0,0,1,0,0,0,0,1,1}, | |
{1,0,1,0,0,1,1,0,1,0,1,0,1,0,1,1}, | |
{1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1}, | |
{1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,1}, | |
{1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1}, | |
{1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, | |
{1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, | |
}; | |
void clear_screen(void) | |
{ | |
memset(surf->pixels, 0x00, surf->h*surf->pitch); | |
/* | |
uint8_t *v = surf->pixels; | |
int y; | |
for(y=0; y < surf->h; y++) | |
{ | |
memset(v, surf->w*4, 0x00); | |
v += surf->pitch; | |
}*/ | |
} | |
const float cf_pos1 = 1.0f; | |
const float cf_neg1 = -1.0f; | |
const float cf_cellrnd = 0.001f; | |
const float cf_distnumer = 200.0f; | |
const float cf8_ascend[8] = { | |
0.0f/8.0f, 1.0f/8.0f, 2.0f/8.0f, 3.0f/8.0f, | |
4.0f/8.0f, 5.0f/8.0f, 6.0f/8.0f, 7.0f/8.0f, | |
}; | |
float ray_result_time[800]; | |
float ray_result_x[800]; | |
uint8_t ray_result_type[800]; | |
void render_raycast_c(struct camera *cam) | |
{ | |
int x,y,i,j; | |
// load camera | |
float px = cam->px; | |
float py = cam->py; | |
float vxb = cam->vx; | |
float vyb = cam->vy; | |
// load cell + distance | |
int cxb = (int)px; | |
int cyb = (int)py; | |
float dx1 = px-(float)cxb; | |
float dy1 = py-(float)cyb; | |
float dx2 = 1.0f-dx1; | |
float dy2 = 1.0f-dy1; | |
// load velocity stuff | |
float vx = vxb-vyb; | |
float vy = vyb+vxb; | |
float dvx = vyb*2.0f/800.0f; | |
float dvy = -vxb*2.0f/800.0f; | |
for(x = 0; x < 800; x++) | |
{ | |
// calculate gx,gy | |
int gx = (vx < 0 ? -1 : 1); | |
int gy = (vy < 0 ? -1 : 1); | |
// calculate avx,avy | |
float avx = vx*(float)gx; | |
float avy = vy*(float)gy; | |
// get correct dx,dy | |
float dx = (gx == -1 ? dx1 : dx2); | |
float dy = (gy == -1 ? dy1 : dy2); | |
int cx = cxb; | |
int cy = cyb; | |
float time = 0.0f; | |
// trace | |
for(i = 0; i < 128; i++) | |
{ | |
// compare tx, ty | |
int tc = dx == 0 || (dy != 0 && dx/avx < dy/avy); | |
if(tc) | |
{ | |
float t = dx/avx; | |
dy -= avy*t; | |
time += t; | |
dx = 1.0f; | |
cx += gx; | |
} else { | |
float t = dy/avy; | |
dx -= avx*t; | |
time += t; | |
dy = 1.0f; | |
cy += gy; | |
} | |
// check cell | |
int cell = map[cy][cx]; | |
if(cell != 0) | |
{ | |
ray_result_time[x] = cf_distnumer/time; | |
break; | |
} | |
} | |
// advance velocity | |
vx += dvx; | |
vy += dvy; | |
} | |
} | |
void render_raycast_avx(struct camera *cam) | |
{ | |
static uint32_t ray_span_cx[8]; | |
static uint32_t ray_span_cy[8]; | |
static uint32_t ray_span_cval[8]; | |
int x,y,i,j; | |
// load some constants | |
__m256 ymm_zero; | |
ymm_zero = _mm256_xor_ps(ymm_zero, ymm_zero); | |
__m256i ymm_neg1i = (__m256i)_mm256_cmp_ps(ymm_zero, ymm_zero, 0); // EQ | |
__m256 ymm_neg1 = _mm256_cvtepi32_ps(ymm_neg1i); | |
__m256 ymm_pos1 = _mm256_sub_ps(ymm_zero, ymm_neg1); | |
__m256 ymm_ascend = _mm256_load_ps(cf8_ascend); | |
__m256 ymm_cellrnd = _mm256_broadcast_ss(&cf_cellrnd); | |
__m256 ymm_distnumer = _mm256_broadcast_ss(&cf_distnumer); | |
// load positions + velocities into YMM regs | |
__m256 ymm_ray_px = _mm256_broadcast_ss(&cam->px); | |
__m256 ymm_ray_py = _mm256_broadcast_ss(&cam->py); | |
__m256 ymm_ray_vxb = _mm256_broadcast_ss(&cam->vx); | |
__m256 ymm_ray_vyb = _mm256_broadcast_ss(&cam->vy); | |
// calculate cell position | |
__m256 ymm_ray_cxb = _mm256_floor_ps(ymm_ray_px); | |
__m256 ymm_ray_cyb = _mm256_floor_ps(ymm_ray_py); | |
ymm_ray_cxb = _mm256_add_ps(ymm_ray_cxb, ymm_cellrnd); | |
ymm_ray_cyb = _mm256_add_ps(ymm_ray_cyb, ymm_cellrnd); | |
// calculate the dreaded dx,dy values | |
__m256 ymm_ray_dx1 = _mm256_floor_ps(ymm_ray_px); // \_.- get rounded pos | |
__m256 ymm_ray_dy1 = _mm256_floor_ps(ymm_ray_py); // / | |
ymm_ray_dx1 = _mm256_sub_ps(ymm_ray_px, ymm_ray_dx1); // -ve direction | |
ymm_ray_dy1 = _mm256_sub_ps(ymm_ray_py, ymm_ray_dy1); // | |
__m256 ymm_ray_dx2 = _mm256_sub_ps(ymm_pos1, ymm_ray_dx1); // +ve direction | |
__m256 ymm_ray_dy2 = _mm256_sub_ps(ymm_pos1, ymm_ray_dy1); // | |
// calculate the velocity + delta values | |
// TODO: get correct arrangement of add,sub signing | |
const float cf_dvx = 16.0f/800.0f; | |
const float cf_dvy = -cf_dvx; | |
/* | |
float vx = vxb-vyb; | |
float vy = vyb+vxb; | |
float dvx = vyb*2.0f/800.0f; | |
float dvy = -vxb*2.0f/800.0f; | |
*/ | |
__m256 ymm_ray_dvx = _mm256_broadcast_ss(&cf_dvx); | |
__m256 ymm_ray_dvy = _mm256_broadcast_ss(&cf_dvy); | |
__m256 ymm_ray_vx = _mm256_sub_ps(ymm_ray_vxb, ymm_ray_vyb); | |
__m256 ymm_ray_vy = _mm256_add_ps(ymm_ray_vyb, ymm_ray_vxb); | |
ymm_ray_dvx = _mm256_mul_ps(ymm_ray_dvx, ymm_ray_vyb); | |
ymm_ray_dvy = _mm256_mul_ps(ymm_ray_dvy, ymm_ray_vxb); | |
__m256 ymm_ray_vxq = _mm256_mul_ps(ymm_ray_dvx, ymm_ascend); | |
__m256 ymm_ray_vyq = _mm256_mul_ps(ymm_ray_dvy, ymm_ascend); | |
ymm_ray_vx = _mm256_add_ps(ymm_ray_vx, ymm_ray_vxq); | |
ymm_ray_vy = _mm256_add_ps(ymm_ray_vy, ymm_ray_vyq); | |
// trace all rays | |
for(x = 0; x < 800-8+1; x += 8) | |
{ | |
// calculate gx,gy | |
__m256i ymm_ray_gxs = (__m256i)_mm256_cmp_ps(ymm_ray_vx, ymm_zero, 1); // LT | |
__m256i ymm_ray_gys = (__m256i)_mm256_cmp_ps(ymm_ray_vy, ymm_zero, 1); // LT | |
__m256 ymm_ray_gx = _mm256_blendv_ps(ymm_pos1, ymm_neg1, (__m256)ymm_ray_gxs); | |
__m256 ymm_ray_gy = _mm256_blendv_ps(ymm_pos1, ymm_neg1, (__m256)ymm_ray_gys); | |
// reset some stuff | |
__m256 ymm_ray_time = ymm_zero; | |
__m256 ymm_ray_lock = ymm_zero; | |
__m256 ymm_ray_cx = ymm_ray_cxb; | |
__m256 ymm_ray_cy = ymm_ray_cyb; | |
__m256 ymm_ray_dir = ymm_zero; | |
// select dx,dy via gx,gy | |
__m256 ymm_ray_dx = _mm256_blendv_ps(ymm_ray_dx2, ymm_ray_dx1, (__m256)ymm_ray_gxs); | |
__m256 ymm_ray_dy = _mm256_blendv_ps(ymm_ray_dy2, ymm_ray_dy1, (__m256)ymm_ray_gys); | |
// get abs velocity | |
__m256 ymm_ray_avx = _mm256_mul_ps(ymm_ray_vx, ymm_ray_gx); | |
__m256 ymm_ray_avy = _mm256_mul_ps(ymm_ray_vy, ymm_ray_gy); | |
int tsum = 8; | |
for(i = 0; i < 128; i++) // Just in case I can't code to save myself. --GM | |
{ | |
// calculate comparitive time | |
// note: | |
// tx >/< ty | |
// => dx/vx >/< dy/vy | |
// => dx*vy >/< dy*vx | |
// VMULPS could be used creatively. | |
__m256 ymm_ray_tx = _mm256_mul_ps(ymm_ray_dx, ymm_ray_avy); | |
__m256 ymm_ray_ty = _mm256_mul_ps(ymm_ray_dy, ymm_ray_avx); | |
// compare times | |
__m256 ymm_ray_tc1 = _mm256_cmp_ps(ymm_ray_ty, ymm_zero, 0); // EQ | |
__m256 ymm_ray_tc2 = _mm256_cmp_ps(ymm_ray_tx, ymm_zero, 0); // EQ | |
__m256 ymm_ray_tc3 = _mm256_cmp_ps(ymm_ray_ty, ymm_ray_tx, 2); // LTE | |
__m256 ymm_ray_tc12 = _mm256_andnot_ps(ymm_ray_tc1, ymm_ray_tc2); | |
__m256 ymm_ray_tc = _mm256_or_ps(ymm_ray_tc12, ymm_ray_tc3); | |
// advance cx,cy | |
__m256 ymm_ray_ncx = _mm256_blendv_ps(ymm_ray_gx, ymm_zero, ymm_ray_tc); | |
__m256 ymm_ray_ncy = _mm256_blendv_ps(ymm_zero, ymm_ray_gy, ymm_ray_tc); | |
ymm_ray_ncx = _mm256_add_ps(ymm_ray_ncx, ymm_ray_cx); | |
ymm_ray_ncy = _mm256_add_ps(ymm_ray_ncy, ymm_ray_cy); | |
// calculate new dx,dy | |
// separate into d1 (boundary crossed) and d2 (boundary not crossed yet) | |
__m256 ymm_ray_nv1 = _mm256_blendv_ps(ymm_ray_avx, ymm_ray_avy, ymm_ray_tc); | |
__m256 ymm_ray_nd1 = _mm256_blendv_ps(ymm_ray_dx, ymm_ray_dy, ymm_ray_tc); | |
__m256 ymm_ray_nv2 = _mm256_blendv_ps(ymm_ray_avy, ymm_ray_avx, ymm_ray_tc); | |
__m256 ymm_ray_nd2 = _mm256_blendv_ps(ymm_ray_dy, ymm_ray_dx, ymm_ray_tc); | |
// calculate time | |
__m256 ymm_ray_nt = _mm256_div_ps(ymm_ray_nd1, ymm_ray_nv1); | |
// calculate new nd2 | |
__m256 ymm_ray_nd2a = _mm256_mul_ps(ymm_ray_nt, ymm_ray_nv2); | |
ymm_ray_nd2a = _mm256_sub_ps(ymm_ray_nd2, ymm_ray_nd2a); | |
// merge results back into dx,dy | |
__m256 ymm_ray_ndx = _mm256_blendv_ps(ymm_pos1, ymm_ray_nd2a, ymm_ray_tc); | |
__m256 ymm_ray_ndy = _mm256_blendv_ps(ymm_ray_nd2a, ymm_pos1, ymm_ray_tc); | |
__m256 ymm_ray_ntime = _mm256_add_ps(ymm_ray_nt, ymm_ray_time); | |
// copy n* back to * for all suitable unlocked fields | |
ymm_ray_dx = _mm256_blendv_ps(ymm_ray_ndx, ymm_ray_dx, ymm_ray_lock); | |
ymm_ray_dy = _mm256_blendv_ps(ymm_ray_ndy, ymm_ray_dy, ymm_ray_lock); | |
ymm_ray_cx = _mm256_blendv_ps(ymm_ray_ncx, ymm_ray_cx, ymm_ray_lock); | |
ymm_ray_cy = _mm256_blendv_ps(ymm_ray_ncy, ymm_ray_cy, ymm_ray_lock); | |
ymm_ray_dir = _mm256_blendv_ps(ymm_ray_tc, ymm_ray_dir, ymm_ray_lock); | |
ymm_ray_time = _mm256_blendv_ps(ymm_ray_ntime, ymm_ray_time, ymm_ray_lock); | |
// convert cx,cy to ints + dump | |
__m256i ymm_ray_cxdump = _mm256_cvtps_epi32(ymm_ray_cx); | |
__m256i ymm_ray_cydump = _mm256_cvtps_epi32(ymm_ray_cy); | |
_mm256_store_ps((float *)ray_span_cx, (__m256)ymm_ray_cxdump); | |
_mm256_store_ps((float *)ray_span_cy, (__m256)ymm_ray_cydump); | |
// TODO: check the cell! | |
for(j = 0; j < 8; j++) | |
{ | |
if(ymm_ray_lock[j] == 0.0f) | |
{ | |
int t = ray_span_cval[j] = map[ray_span_cy[j]][ray_span_cx[j]]; | |
if(t != 0) | |
{ | |
ymm_ray_lock[j] = -1.0f; | |
tsum--; | |
//printf("%.3f\n",ymm_ray_time[j]); | |
//printf("%.3f %.3f\n", ymm_ray_cx[j], ymm_ray_cy[j]); | |
} | |
} | |
} | |
// check if everything's sorted | |
if(tsum == 0) | |
break; | |
} | |
/* | |
printf("%0.3f\n", ymm_ray_lock[0]+ymm_ray_lock[1]+ymm_ray_lock[2]+ymm_ray_lock[3] | |
+ ymm_ray_lock[4]+ymm_ray_lock[5]+ymm_ray_lock[6]+ymm_ray_lock[7]); | |
*/ | |
// invert time | |
ymm_ray_time = _mm256_div_ps(ymm_distnumer, ymm_ray_time); | |
// dump all the things | |
// TODO: calculate ray_result_x | |
_mm256_store_ps(&(ray_result_time[x]), ymm_ray_time); | |
// advance velocity | |
ymm_ray_vx = _mm256_add_ps(ymm_ray_vx, ymm_ray_dvx); | |
ymm_ray_vy = _mm256_add_ps(ymm_ray_vy, ymm_ray_dvy); | |
/* | |
// DEBUG: distance step | |
for(j = 0; j < 8; j++) | |
{ | |
float v = ray_result_time[x+j]; | |
int iv = (int)(v*10+0.5f)+300; | |
if(iv >= 0 && iv < surf->h) | |
{ | |
*(uint32_t*)(&(((uint8_t *)surf->pixels)[iv*surf->pitch+(j+x)*4])) = 0x00FFFFFF; | |
} | |
}*/ | |
} | |
} | |
void render_raycast(struct camera *cam) | |
{ | |
int x,y,i,j; | |
static int phtab[800]; | |
static int prtab[800]; | |
//render_raycast_c(cam); | |
render_raycast_avx(cam); | |
// build pillar height table | |
for(x = 0; x < 800; x++) | |
{ | |
int iv = (int)(ray_result_time[x]+0.5); | |
if(iv > 300) | |
iv = 300; | |
phtab[x] = iv; | |
prtab[x] = 0; | |
} | |
uint8_t *p1 = (uint8_t *)surf->pixels; | |
uint8_t *p2 = &((uint8_t *)surf->pixels)[surf->pitch*(surf->h-1)]; | |
long pitch = surf->pitch; | |
for(y = 300; y > 0; y--) | |
{ | |
uint32_t *rp1 = (uint32_t *)p1; | |
uint32_t *rp2 = (uint32_t *)p2; | |
// draw previous pillars, filling in voids for next time | |
int first_good_x = -1; | |
x = 0; | |
while(x < 800) | |
{ | |
int span_len = prtab[x]; | |
if(span_len == 0) | |
{ | |
if(phtab[x] >= y) | |
{ | |
if(first_good_x == -1) | |
first_good_x = x; | |
*(rp1++) = *(rp2++) = 0xFFFFFFFF; | |
} else { | |
if(first_good_x != -1) { | |
prtab[first_good_x] = x-first_good_x; | |
first_good_x = -1; | |
} | |
rp1++; | |
rp2++; | |
} | |
x++; | |
} else { | |
if(first_good_x == -1) | |
first_good_x = x; | |
memset(rp1, 0xFF, span_len*4); | |
memset(rp2, 0xFF, span_len*4); | |
x += span_len; | |
rp1 += span_len; | |
rp2 += span_len; | |
} | |
} | |
if(first_good_x != -1) | |
prtab[first_good_x] = x-first_good_x; | |
p1 += pitch; | |
p2 -= pitch; | |
} | |
} | |
void mainloop(void) | |
{ | |
int i; | |
SDL_LockSurface(surf); | |
clear_screen(); | |
SDL_UnlockSurface(surf); | |
for(i = 0; i < 200; i++) | |
{ | |
SDL_LockSurface(surf); | |
clear_screen(); | |
basecam.vx = sin(i*M_PI/100.0); | |
basecam.vy = cos(i*M_PI/100.0); | |
render_raycast(&basecam); | |
SDL_UnlockSurface(surf); | |
SDL_UpdateRect(surf, 0,0,0,0); | |
SDL_Delay(10); | |
} | |
} | |
int main(int argc, char *argv[]) | |
{ | |
SDL_Init(SDL_INIT_VIDEO); | |
surf = SDL_SetVideoMode(800, 600, 32, 0); | |
mainloop(); | |
SDL_Quit(); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment