Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Compares the performance of fixed-point vs floating-point arithmetic on iPhone 5 (Apple A6 processor) by evaluating a contrived 'movement update' algorithm over a large array of 'particle' structs. Intended to identify best practices for developing an efficient particle engine. I recommend testing on the device with 'Release' versions, using -O3…
//
// main.m
// fp_profiler
//
// Compares the performance of fixed-point vs floating-point for simple
// "particle engine" movement loops.
//
// Optimized for iPhone 5 (Apple A6 / ARMv7)
//
// Created by Trevor Bentley on 5/29/13.
// Copyright (c) 2013 Trevor Bentley. All rights reserved.
//
#import <dispatch/dispatch.h>
/*
 * Compile-time assertion: the build fails when `e` evaluates to false.
 *
 * Implemented with _Static_assert rather than the "enum with a division by
 * zero" trick. The enum form declares a fixed enumerator name, so a second use
 * in the same scope is a redefinition error, and that name (leading double
 * underscore) is reserved for the implementation. The stringified expression
 * is used as the diagnostic message.
 */
#define compiler_assert(e) _Static_assert((e), #e)
/***********************************************************************
** FIXED POINT
***********************************************************************/
/*
 * f23Q9: signed 32-bit fixed-point value with F23Q9_SHIFT (9) fractional bits.
 * A raw value `r` represents r / F23Q9_SCALE.
 */
typedef int32_t f23Q9;
#define F23Q9_SHIFT 9
#define F23Q9_SCALE (1<<F23Q9_SHIFT)

/*
 * Integer -> fixed point. Multiplies instead of shifting because left-shifting
 * a negative signed value is undefined behaviour in C.
 */
#define INT2F23Q9(val) ( (f23Q9) ((f23Q9)(val) * F23Q9_SCALE) )
/* Float -> fixed point; the fractional remainder is truncated by the cast. */
#define FLOAT2F23Q9(val) ( (f23Q9) ((val) * (float)F23Q9_SCALE ) )
/* Fixed point -> unsigned integer part (plain right shift of the raw value). */
#define F23Q92UINT(val) ( (uint32_t) ((val) >> F23Q9_SHIFT ) )
/* Fixed point -> signed integer part; division truncates toward zero. */
#define F23Q92INT(val) ( (int32_t) ((val) / F23Q9_SCALE ) )
/* Fixed point -> float. */
#define F23Q92FLOAT(val) ( (float) ((val) / (float)F23Q9_SCALE ) )

/*
 * Fixed-point multiply. The product of two raw values carries 18 fractional
 * bits, so it is formed in 64 bits before being shifted back down; a 32-bit
 * intermediate overflows as soon as both operands exceed roughly 64.0.
 */
#define F23Q9_MUL(a,b) ( (f23Q9) (((int64_t)(a) * (int64_t)(b)) >> F23Q9_SHIFT) )
/*
 * Fixed-point divide. The dividend is pre-scaled in 64 bits so that scaling
 * cannot overflow. `b` must be non-zero: this is an integer division.
 */
#define F23Q9_DIV(a,b) ( (f23Q9) (((int64_t)(a) * F23Q9_SCALE) / (b)) )
/***********************************************************************
** PARTICLES
***********************************************************************/
/* Alignment, in bytes, applied to the two particle pools. */
/* NOTE(review): presumably the data-cache line size of the target CPU; confirm. */
#define CACHELINE_SIZE 64
/* Alignment, in bytes, of a single particle record; the compile-time assert */
/* after the pools also uses it as the upper bound on the record size. */
#define PARTICLE_ALIGNMENT 32
/* Number of records in each statically allocated pool (4 * 1024 * 1024). */
#define PARTICLE_POOL_SIZE (1024*1024*4)
/*
 * Particle record used by the fixed-point benchmark loops.
 * The record is aligned to PARTICLE_ALIGNMENT bytes.
 */
typedef struct {
/* Integer position; each update adds the integer part of (velocity * scale). */
int_fast16_t x;
int_fast16_t y;
/* Four 8-bit channels; particles_init() sets every byte to 255. */
struct {uint8_t r; uint8_t g; uint8_t b; uint8_t a;} __attribute__((packed)) color;
/* Velocity, f23Q9 fixed point. */
f23Q9 vx;
f23Q9 vy;
/* Acceleration, f23Q9 fixed point; folded into the velocity on each update. */
f23Q9 ax;
f23Q9 ay;
/* f23Q9 factor: the update multiplies velocity and ax by it and divides ay by it. */
f23Q9 scale;
} __attribute__((aligned (PARTICLE_ALIGNMENT))) particle_t;
/*
 * Particle record used by the floating-point benchmark loop. Same field names
 * as particle_t, with the velocity/acceleration/scale fields stored as float.
 * The record is aligned to PARTICLE_ALIGNMENT bytes.
 */
typedef struct {
/* Integer position; each update adds (scale * velocity), truncated by the cast. */
int_fast16_t x;
int_fast16_t y;
/* Four 8-bit channels; particles_init() sets every byte to 255. */
struct {uint8_t r; uint8_t g; uint8_t b; uint8_t a;} __attribute__((packed)) color;
/* Velocity. */
float vx;
float vy;
/* Acceleration; folded into the velocity on each update. */
float ax;
float ay;
/* Factor: the update multiplies velocity and ax by it and divides ay by it. */
float scale;
} __attribute__((aligned (PARTICLE_ALIGNMENT))) fparticle_t;
/*
 * Statically allocated pools: one of fixed-point particles and one of
 * floating-point particles, each PARTICLE_POOL_SIZE records long and aligned
 * to CACHELINE_SIZE bytes.
 */
particle_t __attribute__((aligned (CACHELINE_SIZE))) particles[PARTICLE_POOL_SIZE];
fparticle_t __attribute__((aligned (CACHELINE_SIZE))) fparticles[PARTICLE_POOL_SIZE];

/*
 * Both record types must fit within PARTICLE_ALIGNMENT bytes. Because of the
 * aligned attribute, sizeof() is rounded up to a multiple of the alignment, so
 * a record that outgrows the limit makes these checks fail at build time.
 */
compiler_assert(sizeof(particle_t) <= PARTICLE_ALIGNMENT);
/*
 * The float record is checked with _Static_assert directly so that this second
 * check does not rely on compiler_assert being usable twice in one scope.
 */
_Static_assert(sizeof(fparticle_t) <= PARTICLE_ALIGNMENT,
               "fparticle_t must fit in PARTICLE_ALIGNMENT bytes");
/***********************************************************************
** PROFILER DEFINITIONS
***********************************************************************/
/* Number of times main() calls each profiled function; the reported time is the mean. */
#define PROFILER_LOOP_COUNT 15
/* Fills both particle pools with a constant byte pattern. */
void particles_memset(void);
/* Fills both particle pools with random positions, velocities, accelerations and scales. */
void particles_init(void);
/* Fixed-point update, split into two halves submitted with dispatch_async. */
void particles_async(dispatch_queue_t *dqueue);
/* Fixed-point update, split into two halves run through dispatch_apply. */
void particles_applyloop(dispatch_queue_t *dqueue);
/* Fixed-point update of the whole pool on the calling thread; dqueue is unused. */
void particles_intsync(dispatch_queue_t *dqueue);
/* Floating-point update of the whole pool on the calling thread; dqueue is unused. */
void particles_float(dispatch_queue_t *dqueue);
/* Logs sample conversions, multiplications and divisions done with the F23Q9 macros. */
void test_fixed_macros(void);
/***********************************************************************
** PROFILER FUNCTIONS
***********************************************************************/
/*
 * Table of the benchmark variants that main() times, in the order they are
 * run. The index into this table is the number printed next to each result.
 */
static void (*funcs[])(dispatch_queue_t *dqueue) = {
    particles_async,
    particles_applyloop,
    particles_intsync,
    particles_float,
};
/*
 * Number of entries in funcs. Derived from the array's own element size
 * instead of an unrelated function-pointer type, so it stays correct if the
 * element type ever changes.
 */
static const int func_count = sizeof(funcs) / sizeof(funcs[0]);
/*
 * Fills both particle pools with the byte 0xEF.
 *
 * memset() converts its fill argument to unsigned char, so a 32-bit pattern
 * such as 0xDEADBEEF only ever stores its low byte (0xEF); the byte is spelled
 * out here so the code says what actually happens. Sizes are taken from the
 * arrays themselves so they cannot drift from the declarations.
 *
 * NOTE(review): presumably this exists to touch every page of the pools before
 * timing starts; confirm the intent.
 */
void particles_memset(void) {
    memset(particles, 0xEF, sizeof(particles));
    memset(fparticles, 0xEF, sizeof(fparticles));
}
/*
 * Populates both particle pools with random start values.
 *
 * Every particle gets a position in [0, 199], an all-255 colour, and random
 * velocity, acceleration and scale. The scale is never zero, because the
 * update loops divide by it (integer division by zero in the fixed-point pool,
 * inf/NaN followed by a float-to-int conversion in the float pool).
 */
void particles_init(void) {
    particle_t *p = particles;
    for (int i = 0; i < PARTICLE_POOL_SIZE; ++i, ++p) {
        p->x = arc4random() % 200;
        p->y = arc4random() % 200;
        memset(&(p->color), 255, sizeof(p->color));
        /*
         * The subtraction is carried out in unsigned arithmetic and wraps for
         * results below zero; INT2F23Q9's conversion to f23Q9 turns that back
         * into the intended signed value in [-5, 4].
         */
        p->vx = INT2F23Q9(arc4random() % 10 - 5);
        p->vy = INT2F23Q9(arc4random() % 10 - 5);
        p->ax = INT2F23Q9(arc4random() % 10 - 5);
        p->ay = INT2F23Q9(arc4random() % 10 - 5);
        /* 1 or 2, never 0: the update divides by scale. */
        p->scale = INT2F23Q9(arc4random() % 2 + 1);
    }

    fparticle_t *fp = fparticles;
    for (int i = 0; i < PARTICLE_POOL_SIZE; ++i, ++fp) {
        fp->x = arc4random() % 200;
        fp->y = arc4random() % 200;
        memset(&(fp->color), 255, sizeof(fp->color));
        /*
         * Convert to a signed type BEFORE subtracting. In unsigned arithmetic
         * "0..4 - 5" wraps to about 4.29e9, which the float division would
         * turn into a velocity near 8.6e8 instead of a value in [-1.0, 0.8].
         */
        fp->vx = ((int32_t)(arc4random() % 10) - 5) / 5.0f;
        fp->vy = ((int32_t)(arc4random() % 10) - 5) / 5.0f;
        fp->ax = ((int32_t)(arc4random() % 10) - 5) / 5.0f;
        fp->ay = ((int32_t)(arc4random() % 10) - 5) / 5.0f;
        /* 0.05 .. 0.5, never 0: the update divides by scale. */
        fp->scale = (arc4random() % 10 + 1) / 20.0f;
    }
}
/*
 * Advances two consecutive fixed-point particles by one step (a hand-unrolled
 * pair). Expects a `particle_t *p` in scope at the expansion site and leaves
 * it pointing two records further on.
 *
 * Per particle:
 *   x  += integer part of (vx * scale)
 *   y  += integer part of (vy * scale)
 *   vx += scale * ax
 *   vy += ay / scale      (scale must be non-zero: integer division)
 *
 * Wrapped in do { } while (0) so the expansion is a single statement and
 * stays correct inside an unbraced if/else or loop body.
 */
#define F23Q9_UNROLLED_BLOCK() \
    do { \
        p->x = (int_fast16_t)(p->x + F23Q92INT(F23Q9_MUL(p->vx, p->scale))); \
        p->y = (int_fast16_t)(p->y + F23Q92INT(F23Q9_MUL(p->vy, p->scale))); \
        p->vx = p->vx + F23Q9_MUL(p->scale,p->ax); \
        p->vy = p->vy + F23Q9_DIV(p->ay,p->scale); \
        ++p; \
        p->x = (int_fast16_t)(p->x + F23Q92INT(F23Q9_MUL(p->vx, p->scale))); \
        p->y = (int_fast16_t)(p->y + F23Q92INT(F23Q9_MUL(p->vy, p->scale))); \
        p->vx = p->vx + F23Q9_MUL(p->scale,p->ax); \
        p->vy = p->vy + F23Q9_DIV(p->ay,p->scale); \
        ++p; \
    } while (0)
/*
 * Fixed-point update of the whole pool as two asynchronous tasks.
 *
 * Each task walks one half of `particles` (PARTICLE_POOL_SIZE/4 unrolled
 * pairs). The function returns only after both tasks have finished: the empty
 * barrier block cannot run until everything submitted before it has completed.
 *
 * dqueue: pointer to the queue the two tasks are submitted to.
 */
void particles_async(dispatch_queue_t *dqueue) {
    dispatch_queue_t queue = *dqueue;
    const int pairs_per_half = PARTICLE_POOL_SIZE >> 2;
    particle_t *const lower_half = particles;
    particle_t *const upper_half = particles + (PARTICLE_POOL_SIZE >> 1);

    dispatch_async(queue, ^{
        particle_t *p = lower_half;
        for (int pair = 0; pair < pairs_per_half; ++pair) {
            F23Q9_UNROLLED_BLOCK();
        }
    });

    dispatch_async(queue, ^{
        particle_t *p = upper_half;
        for (int pair = 0; pair < pairs_per_half; ++pair) {
            F23Q9_UNROLLED_BLOCK();
        }
    });

    /* Wait for both halves before returning to the timing loop. */
    dispatch_barrier_sync(queue, ^{});
}
/*
 * Fixed-point update of the whole pool via dispatch_apply with two iterations.
 *
 * Iteration 0 processes the first half of `particles`, iteration 1 the second
 * half; each runs PARTICLE_POOL_SIZE/4 unrolled pairs.
 *
 * dqueue: pointer to the queue handed to dispatch_apply.
 */
void particles_applyloop(dispatch_queue_t *dqueue) {
    dispatch_apply(2, *dqueue, ^(size_t half) {
        /* (half * pool size) / 2 is the index of the first record of this half. */
        const size_t first = (half * PARTICLE_POOL_SIZE) >> 1;
        particle_t *p = &particles[first];
        int remaining = PARTICLE_POOL_SIZE >> 2;
        while (remaining > 0) {
            F23Q9_UNROLLED_BLOCK();
            --remaining;
        }
    });
}
/*
 * Fixed-point update of the whole pool on the calling thread.
 *
 * dqueue: unused; present so the function matches the profiler table's
 *         signature.
 */
void particles_intsync(dispatch_queue_t *dqueue) {
    particle_t *p = particles;
    int pairs_left = PARTICLE_POOL_SIZE >> 1;
    /* Each expansion advances p by two records, so half the pool size covers it all. */
    while (pairs_left-- > 0) {
        F23Q9_UNROLLED_BLOCK();
    }
}
/*
 * Floating-point update of the whole pool on the calling thread, two particles
 * per loop iteration (hand-unrolled, mirroring the fixed-point variant).
 *
 * Per particle:
 *   x  += scale * vx   (result truncated back to the integer position type)
 *   y  += scale * vy
 *   vx += scale * ax
 *   vy += ay / scale
 *
 * dqueue: unused; present so the function matches the profiler table's
 *         signature.
 */
void particles_float(dispatch_queue_t *dqueue) {
    fparticle_t *const pool_end = fparticles + PARTICLE_POOL_SIZE;

    for (fparticle_t *pair = fparticles; pair < pool_end; pair += 2) {
        fparticle_t *fp = pair;
        fp->x = (int_fast16_t)(fp->x + fp->scale*fp->vx);
        fp->y = (int_fast16_t)(fp->y + fp->scale*fp->vy);
        fp->vx = fp->vx + fp->scale*fp->ax;
        fp->vy = fp->vy + fp->ay/fp->scale;

        fp = pair + 1;
        fp->x = (int_fast16_t)(fp->x + fp->scale*fp->vx);
        fp->y = (int_fast16_t)(fp->y + fp->scale*fp->vy);
        fp->vx = fp->vx + fp->scale*fp->ax;
        fp->vy = fp->vy + fp->ay/fp->scale;
    }
}
/*
 * Logs a handful of round trips through the F23Q9 macros so the output can be
 * checked by eye: int/float conversions first, then multiplication, then
 * division, each with positive and negative operands.
 */
void test_fixed_macros(void) {
    /* Sample inputs for the conversion checks. */
    const int positiveInt = 127;
    const uint16_t unsignedShort = 1;
    const int negativeInt = -128;
    const float positiveFloat = 4100.12345f;
    const float negativeFloat = -14.45678f;

    /* Fixed-point operands shared by the multiplication and division checks. */
    const f23Q9 three = INT2F23Q9(3);
    const f23Q9 minusThree = INT2F23Q9(-3);
    const f23Q9 four = INT2F23Q9(4);
    const f23Q9 minusFour = INT2F23Q9(-4);
    const f23Q9 onePointTwoFive = FLOAT2F23Q9(1.25);
    const f23Q9 minusOnePointTwoFive = FLOAT2F23Q9(-1.25);
    const f23Q9 twoPointThreeSix = FLOAT2F23Q9(2.36);
    const f23Q9 minusTwoPointThreeSix = FLOAT2F23Q9(-2.36);

    NSLog(@"Conversion:");
    const f23Q9 fixedPositiveInt = INT2F23Q9(positiveInt);
    NSLog(@"%d -> %x -> %d", positiveInt, fixedPositiveInt, F23Q92INT(fixedPositiveInt));
    const f23Q9 fixedUnsignedShort = INT2F23Q9(unsignedShort);
    NSLog(@"%u -> %x -> %u", unsignedShort, fixedUnsignedShort, F23Q92UINT(fixedUnsignedShort));
    const f23Q9 fixedNegativeInt = INT2F23Q9(negativeInt);
    NSLog(@"%d -> %x -> %d", negativeInt, fixedNegativeInt, F23Q92INT(fixedNegativeInt));
    const f23Q9 fixedPositiveFloat = FLOAT2F23Q9(positiveFloat);
    NSLog(@"%f -> %x -> %f", positiveFloat, fixedPositiveFloat, F23Q92FLOAT(fixedPositiveFloat));
    const f23Q9 fixedNegativeFloat = FLOAT2F23Q9(negativeFloat);
    NSLog(@"%f -> %x -> %f", negativeFloat, fixedNegativeFloat, F23Q92FLOAT(fixedNegativeFloat));

    NSLog(@"Multiplication:");
    NSLog(@"3*4 -> %d", F23Q92INT(F23Q9_MUL(three, four)));
    NSLog(@"3*-4 -> %d", F23Q92INT(F23Q9_MUL(three, minusFour)));
    NSLog(@"-3*-4 -> %d", F23Q92INT(F23Q9_MUL(minusThree, minusFour)));
    NSLog(@"1.25*2.36 -> %f", F23Q92FLOAT(F23Q9_MUL(onePointTwoFive, twoPointThreeSix)));
    NSLog(@"1.25*-2.36 -> %f", F23Q92FLOAT(F23Q9_MUL(onePointTwoFive, minusTwoPointThreeSix)));
    NSLog(@"-1.25*-2.36 -> %f", F23Q92FLOAT(F23Q9_MUL(minusOnePointTwoFive, minusTwoPointThreeSix)));

    NSLog(@"Division:");
    NSLog(@"3/4 -> %f", F23Q92FLOAT(F23Q9_DIV(three, four)));
    NSLog(@"3/-4 -> %f", F23Q92FLOAT(F23Q9_DIV(three, minusFour)));
    NSLog(@"-3/-4 -> %f", F23Q92FLOAT(F23Q9_DIV(minusThree, minusFour)));
    NSLog(@"1.25/2.36 -> %f", F23Q92FLOAT(F23Q9_DIV(onePointTwoFive, twoPointThreeSix)));
    NSLog(@"1.25/-2.36 -> %f", F23Q92FLOAT(F23Q9_DIV(onePointTwoFive, minusTwoPointThreeSix)));
    NSLog(@"-1.25/-2.36 -> %f", F23Q92FLOAT(F23Q9_DIV(minusOnePointTwoFive, minusTwoPointThreeSix)));
}
/*
 * Entry point: logs the fixed-point macro self-check, initialises both
 * particle pools, then times every function in `funcs`.
 *
 * Each function is called PROFILER_LOOP_COUNT times and the mean wall-clock
 * duration per call is logged as "<index>: <seconds>". The process is then
 * kept alive without returning.
 */
int main(int argc, char *argv[])
{
    /* Foundation calls (NSLog) create autoreleased objects; give them a pool. */
    @autoreleasepool {
        dispatch_queue_t dqueue = dispatch_queue_create("com.trevorbentley.fp_profiler", DISPATCH_QUEUE_CONCURRENT);

        test_fixed_macros();

        NSLog(@"Initializing...");
        particles_memset();
        particles_init();

        NSLog(@"Profiling...");
        for (int state = 0; state < func_count; state++) {
            CFAbsoluteTime start = CFAbsoluteTimeGetCurrent();
            for (int i = 0; i < PROFILER_LOOP_COUNT; i++) {
                funcs[state](&dqueue);
            }
            CFAbsoluteTime end = CFAbsoluteTimeGetCurrent();
            NSLog(@"%d: %f", state, (end-start)/PROFILER_LOOP_COUNT);
        }
        NSLog(@"Profiling finished.");
    }

    /*
     * Keep the process alive, as the original endless loop did, but park the
     * main thread instead of spinning a core at 100%. dispatch_main() never
     * returns.
     */
    dispatch_main();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment