Skip to content

Instantly share code, notes, and snippets.

@iamgreaser
Created April 20, 2012 08:47
Show Gist options
  • Save iamgreaser/2427184 to your computer and use it in GitHub Desktop.
Save iamgreaser/2427184 to your computer and use it in GitHub Desktop.
AVX 2d grid caster
/*
incomplete AVX-based SDL 2d-grid raycaster engine
by Ben "GreaseMonkey" Russell, 2012. public domain.
*/
// note to intel: who the hell named these?!?! (also why isn't AVX "ymmintrin")
#include <mmintrin.h> /* MMX */
#include <xmmintrin.h> /* SSE */
#include <emmintrin.h> /* SSE2 */
#include <pmmintrin.h> /* SSE3 */
#include <tmmintrin.h> /* SSSE3 */
#include <smmintrin.h> /* SSE4.1 */
#include <nmmintrin.h> /* SSE4.2 */
#include <immintrin.h> /* AVX */
#include <unistd.h>
#include <string.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdio.h>
#include <errno.h>
#include <math.h>
#include <SDL.h>
struct camera
{
float px, py;
float vx, vy;
};
static struct camera basecam = {7.2, 5.1, 0.0, 1.0};
static SDL_Surface *surf = NULL;
static uint8_t map[16][16] = {
{1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
{1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1},
{1,0,1,0,1,0,1,1,1,0,1,0,1,0,1,1},
{1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1},
{1,0,0,0,1,1,1,0,1,1,1,0,1,0,1,1},
{1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1},
{1,1,1,0,1,1,1,0,1,0,1,0,1,0,1,1},
{1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1},
{1,0,1,0,1,0,0,1,1,0,1,0,1,0,1,1},
{1,0,0,0,0,0,1,0,0,1,0,0,0,0,1,1},
{1,0,1,0,0,1,1,0,1,0,1,0,1,0,1,1},
{1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1},
{1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,1},
{1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1},
{1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
{1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
};
void clear_screen(void)
{
memset(surf->pixels, 0x00, surf->h*surf->pitch);
/*
uint8_t *v = surf->pixels;
int y;
for(y=0; y < surf->h; y++)
{
memset(v, surf->w*4, 0x00);
v += surf->pitch;
}*/
}
const float cf_pos1 = 1.0f;
const float cf_neg1 = -1.0f;
const float cf_cellrnd = 0.001f;
const float cf_distnumer = 200.0f;
const float cf8_ascend[8] = {
0.0f/8.0f, 1.0f/8.0f, 2.0f/8.0f, 3.0f/8.0f,
4.0f/8.0f, 5.0f/8.0f, 6.0f/8.0f, 7.0f/8.0f,
};
float ray_result_time[800];
float ray_result_x[800];
uint8_t ray_result_type[800];
void render_raycast_c(struct camera *cam)
{
int x,y,i,j;
// load camera
float px = cam->px;
float py = cam->py;
float vxb = cam->vx;
float vyb = cam->vy;
// load cell + distance
int cxb = (int)px;
int cyb = (int)py;
float dx1 = px-(float)cxb;
float dy1 = py-(float)cyb;
float dx2 = 1.0f-dx1;
float dy2 = 1.0f-dy1;
// load velocity stuff
float vx = vxb-vyb;
float vy = vyb+vxb;
float dvx = vyb*2.0f/800.0f;
float dvy = -vxb*2.0f/800.0f;
for(x = 0; x < 800; x++)
{
// calculate gx,gy
int gx = (vx < 0 ? -1 : 1);
int gy = (vy < 0 ? -1 : 1);
// calculate avx,avy
float avx = vx*(float)gx;
float avy = vy*(float)gy;
// get correct dx,dy
float dx = (gx == -1 ? dx1 : dx2);
float dy = (gy == -1 ? dy1 : dy2);
int cx = cxb;
int cy = cyb;
float time = 0.0f;
// trace
for(i = 0; i < 128; i++)
{
// compare tx, ty
int tc = dx == 0 || (dy != 0 && dx/avx < dy/avy);
if(tc)
{
float t = dx/avx;
dy -= avy*t;
time += t;
dx = 1.0f;
cx += gx;
} else {
float t = dy/avy;
dx -= avx*t;
time += t;
dy = 1.0f;
cy += gy;
}
// check cell
int cell = map[cy][cx];
if(cell != 0)
{
ray_result_time[x] = cf_distnumer/time;
break;
}
}
// advance velocity
vx += dvx;
vy += dvy;
}
}
void render_raycast_avx(struct camera *cam)
{
static uint32_t ray_span_cx[8];
static uint32_t ray_span_cy[8];
static uint32_t ray_span_cval[8];
int x,y,i,j;
// load some constants
__m256 ymm_zero;
ymm_zero = _mm256_xor_ps(ymm_zero, ymm_zero);
__m256i ymm_neg1i = (__m256i)_mm256_cmp_ps(ymm_zero, ymm_zero, 0); // EQ
__m256 ymm_neg1 = _mm256_cvtepi32_ps(ymm_neg1i);
__m256 ymm_pos1 = _mm256_sub_ps(ymm_zero, ymm_neg1);
__m256 ymm_ascend = _mm256_load_ps(cf8_ascend);
__m256 ymm_cellrnd = _mm256_broadcast_ss(&cf_cellrnd);
__m256 ymm_distnumer = _mm256_broadcast_ss(&cf_distnumer);
// load positions + velocities into YMM regs
__m256 ymm_ray_px = _mm256_broadcast_ss(&cam->px);
__m256 ymm_ray_py = _mm256_broadcast_ss(&cam->py);
__m256 ymm_ray_vxb = _mm256_broadcast_ss(&cam->vx);
__m256 ymm_ray_vyb = _mm256_broadcast_ss(&cam->vy);
// calculate cell position
__m256 ymm_ray_cxb = _mm256_floor_ps(ymm_ray_px);
__m256 ymm_ray_cyb = _mm256_floor_ps(ymm_ray_py);
ymm_ray_cxb = _mm256_add_ps(ymm_ray_cxb, ymm_cellrnd);
ymm_ray_cyb = _mm256_add_ps(ymm_ray_cyb, ymm_cellrnd);
// calculate the dreaded dx,dy values
__m256 ymm_ray_dx1 = _mm256_floor_ps(ymm_ray_px); // \_.- get rounded pos
__m256 ymm_ray_dy1 = _mm256_floor_ps(ymm_ray_py); // /
ymm_ray_dx1 = _mm256_sub_ps(ymm_ray_px, ymm_ray_dx1); // -ve direction
ymm_ray_dy1 = _mm256_sub_ps(ymm_ray_py, ymm_ray_dy1); //
__m256 ymm_ray_dx2 = _mm256_sub_ps(ymm_pos1, ymm_ray_dx1); // +ve direction
__m256 ymm_ray_dy2 = _mm256_sub_ps(ymm_pos1, ymm_ray_dy1); //
// calculate the velocity + delta values
// TODO: get correct arrangement of add,sub signing
const float cf_dvx = 16.0f/800.0f;
const float cf_dvy = -cf_dvx;
/*
float vx = vxb-vyb;
float vy = vyb+vxb;
float dvx = vyb*2.0f/800.0f;
float dvy = -vxb*2.0f/800.0f;
*/
__m256 ymm_ray_dvx = _mm256_broadcast_ss(&cf_dvx);
__m256 ymm_ray_dvy = _mm256_broadcast_ss(&cf_dvy);
__m256 ymm_ray_vx = _mm256_sub_ps(ymm_ray_vxb, ymm_ray_vyb);
__m256 ymm_ray_vy = _mm256_add_ps(ymm_ray_vyb, ymm_ray_vxb);
ymm_ray_dvx = _mm256_mul_ps(ymm_ray_dvx, ymm_ray_vyb);
ymm_ray_dvy = _mm256_mul_ps(ymm_ray_dvy, ymm_ray_vxb);
__m256 ymm_ray_vxq = _mm256_mul_ps(ymm_ray_dvx, ymm_ascend);
__m256 ymm_ray_vyq = _mm256_mul_ps(ymm_ray_dvy, ymm_ascend);
ymm_ray_vx = _mm256_add_ps(ymm_ray_vx, ymm_ray_vxq);
ymm_ray_vy = _mm256_add_ps(ymm_ray_vy, ymm_ray_vyq);
// trace all rays
for(x = 0; x < 800-8+1; x += 8)
{
// calculate gx,gy
__m256i ymm_ray_gxs = (__m256i)_mm256_cmp_ps(ymm_ray_vx, ymm_zero, 1); // LT
__m256i ymm_ray_gys = (__m256i)_mm256_cmp_ps(ymm_ray_vy, ymm_zero, 1); // LT
__m256 ymm_ray_gx = _mm256_blendv_ps(ymm_pos1, ymm_neg1, (__m256)ymm_ray_gxs);
__m256 ymm_ray_gy = _mm256_blendv_ps(ymm_pos1, ymm_neg1, (__m256)ymm_ray_gys);
// reset some stuff
__m256 ymm_ray_time = ymm_zero;
__m256 ymm_ray_lock = ymm_zero;
__m256 ymm_ray_cx = ymm_ray_cxb;
__m256 ymm_ray_cy = ymm_ray_cyb;
__m256 ymm_ray_dir = ymm_zero;
// select dx,dy via gx,gy
__m256 ymm_ray_dx = _mm256_blendv_ps(ymm_ray_dx2, ymm_ray_dx1, (__m256)ymm_ray_gxs);
__m256 ymm_ray_dy = _mm256_blendv_ps(ymm_ray_dy2, ymm_ray_dy1, (__m256)ymm_ray_gys);
// get abs velocity
__m256 ymm_ray_avx = _mm256_mul_ps(ymm_ray_vx, ymm_ray_gx);
__m256 ymm_ray_avy = _mm256_mul_ps(ymm_ray_vy, ymm_ray_gy);
int tsum = 8;
for(i = 0; i < 128; i++) // Just in case I can't code to save myself. --GM
{
// calculate comparitive time
// note:
// tx >/< ty
// => dx/vx >/< dy/vy
// => dx*vy >/< dy*vx
// VMULPS could be used creatively.
__m256 ymm_ray_tx = _mm256_mul_ps(ymm_ray_dx, ymm_ray_avy);
__m256 ymm_ray_ty = _mm256_mul_ps(ymm_ray_dy, ymm_ray_avx);
// compare times
__m256 ymm_ray_tc1 = _mm256_cmp_ps(ymm_ray_ty, ymm_zero, 0); // EQ
__m256 ymm_ray_tc2 = _mm256_cmp_ps(ymm_ray_tx, ymm_zero, 0); // EQ
__m256 ymm_ray_tc3 = _mm256_cmp_ps(ymm_ray_ty, ymm_ray_tx, 2); // LTE
__m256 ymm_ray_tc12 = _mm256_andnot_ps(ymm_ray_tc1, ymm_ray_tc2);
__m256 ymm_ray_tc = _mm256_or_ps(ymm_ray_tc12, ymm_ray_tc3);
// advance cx,cy
__m256 ymm_ray_ncx = _mm256_blendv_ps(ymm_ray_gx, ymm_zero, ymm_ray_tc);
__m256 ymm_ray_ncy = _mm256_blendv_ps(ymm_zero, ymm_ray_gy, ymm_ray_tc);
ymm_ray_ncx = _mm256_add_ps(ymm_ray_ncx, ymm_ray_cx);
ymm_ray_ncy = _mm256_add_ps(ymm_ray_ncy, ymm_ray_cy);
// calculate new dx,dy
// separate into d1 (boundary crossed) and d2 (boundary not crossed yet)
__m256 ymm_ray_nv1 = _mm256_blendv_ps(ymm_ray_avx, ymm_ray_avy, ymm_ray_tc);
__m256 ymm_ray_nd1 = _mm256_blendv_ps(ymm_ray_dx, ymm_ray_dy, ymm_ray_tc);
__m256 ymm_ray_nv2 = _mm256_blendv_ps(ymm_ray_avy, ymm_ray_avx, ymm_ray_tc);
__m256 ymm_ray_nd2 = _mm256_blendv_ps(ymm_ray_dy, ymm_ray_dx, ymm_ray_tc);
// calculate time
__m256 ymm_ray_nt = _mm256_div_ps(ymm_ray_nd1, ymm_ray_nv1);
// calculate new nd2
__m256 ymm_ray_nd2a = _mm256_mul_ps(ymm_ray_nt, ymm_ray_nv2);
ymm_ray_nd2a = _mm256_sub_ps(ymm_ray_nd2, ymm_ray_nd2a);
// merge results back into dx,dy
__m256 ymm_ray_ndx = _mm256_blendv_ps(ymm_pos1, ymm_ray_nd2a, ymm_ray_tc);
__m256 ymm_ray_ndy = _mm256_blendv_ps(ymm_ray_nd2a, ymm_pos1, ymm_ray_tc);
__m256 ymm_ray_ntime = _mm256_add_ps(ymm_ray_nt, ymm_ray_time);
// copy n* back to * for all suitable unlocked fields
ymm_ray_dx = _mm256_blendv_ps(ymm_ray_ndx, ymm_ray_dx, ymm_ray_lock);
ymm_ray_dy = _mm256_blendv_ps(ymm_ray_ndy, ymm_ray_dy, ymm_ray_lock);
ymm_ray_cx = _mm256_blendv_ps(ymm_ray_ncx, ymm_ray_cx, ymm_ray_lock);
ymm_ray_cy = _mm256_blendv_ps(ymm_ray_ncy, ymm_ray_cy, ymm_ray_lock);
ymm_ray_dir = _mm256_blendv_ps(ymm_ray_tc, ymm_ray_dir, ymm_ray_lock);
ymm_ray_time = _mm256_blendv_ps(ymm_ray_ntime, ymm_ray_time, ymm_ray_lock);
// convert cx,cy to ints + dump
__m256i ymm_ray_cxdump = _mm256_cvtps_epi32(ymm_ray_cx);
__m256i ymm_ray_cydump = _mm256_cvtps_epi32(ymm_ray_cy);
_mm256_store_ps((float *)ray_span_cx, (__m256)ymm_ray_cxdump);
_mm256_store_ps((float *)ray_span_cy, (__m256)ymm_ray_cydump);
// TODO: check the cell!
for(j = 0; j < 8; j++)
{
if(ymm_ray_lock[j] == 0.0f)
{
int t = ray_span_cval[j] = map[ray_span_cy[j]][ray_span_cx[j]];
if(t != 0)
{
ymm_ray_lock[j] = -1.0f;
tsum--;
//printf("%.3f\n",ymm_ray_time[j]);
//printf("%.3f %.3f\n", ymm_ray_cx[j], ymm_ray_cy[j]);
}
}
}
// check if everything's sorted
if(tsum == 0)
break;
}
/*
printf("%0.3f\n", ymm_ray_lock[0]+ymm_ray_lock[1]+ymm_ray_lock[2]+ymm_ray_lock[3]
+ ymm_ray_lock[4]+ymm_ray_lock[5]+ymm_ray_lock[6]+ymm_ray_lock[7]);
*/
// invert time
ymm_ray_time = _mm256_div_ps(ymm_distnumer, ymm_ray_time);
// dump all the things
// TODO: calculate ray_result_x
_mm256_store_ps(&(ray_result_time[x]), ymm_ray_time);
// advance velocity
ymm_ray_vx = _mm256_add_ps(ymm_ray_vx, ymm_ray_dvx);
ymm_ray_vy = _mm256_add_ps(ymm_ray_vy, ymm_ray_dvy);
/*
// DEBUG: distance step
for(j = 0; j < 8; j++)
{
float v = ray_result_time[x+j];
int iv = (int)(v*10+0.5f)+300;
if(iv >= 0 && iv < surf->h)
{
*(uint32_t*)(&(((uint8_t *)surf->pixels)[iv*surf->pitch+(j+x)*4])) = 0x00FFFFFF;
}
}*/
}
}
void render_raycast(struct camera *cam)
{
int x,y,i,j;
static int phtab[800];
static int prtab[800];
//render_raycast_c(cam);
render_raycast_avx(cam);
// build pillar height table
for(x = 0; x < 800; x++)
{
int iv = (int)(ray_result_time[x]+0.5);
if(iv > 300)
iv = 300;
phtab[x] = iv;
prtab[x] = 0;
}
uint8_t *p1 = (uint8_t *)surf->pixels;
uint8_t *p2 = &((uint8_t *)surf->pixels)[surf->pitch*(surf->h-1)];
long pitch = surf->pitch;
for(y = 300; y > 0; y--)
{
uint32_t *rp1 = (uint32_t *)p1;
uint32_t *rp2 = (uint32_t *)p2;
// draw previous pillars, filling in voids for next time
int first_good_x = -1;
x = 0;
while(x < 800)
{
int span_len = prtab[x];
if(span_len == 0)
{
if(phtab[x] >= y)
{
if(first_good_x == -1)
first_good_x = x;
*(rp1++) = *(rp2++) = 0xFFFFFFFF;
} else {
if(first_good_x != -1) {
prtab[first_good_x] = x-first_good_x;
first_good_x = -1;
}
rp1++;
rp2++;
}
x++;
} else {
if(first_good_x == -1)
first_good_x = x;
memset(rp1, 0xFF, span_len*4);
memset(rp2, 0xFF, span_len*4);
x += span_len;
rp1 += span_len;
rp2 += span_len;
}
}
if(first_good_x != -1)
prtab[first_good_x] = x-first_good_x;
p1 += pitch;
p2 -= pitch;
}
}
void mainloop(void)
{
int i;
SDL_LockSurface(surf);
clear_screen();
SDL_UnlockSurface(surf);
for(i = 0; i < 200; i++)
{
SDL_LockSurface(surf);
clear_screen();
basecam.vx = sin(i*M_PI/100.0);
basecam.vy = cos(i*M_PI/100.0);
render_raycast(&basecam);
SDL_UnlockSurface(surf);
SDL_UpdateRect(surf, 0,0,0,0);
SDL_Delay(10);
}
}
int main(int argc, char *argv[])
{
SDL_Init(SDL_INIT_VIDEO);
surf = SDL_SetVideoMode(800, 600, 32, 0);
mainloop();
SDL_Quit();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment