Skip to content

Instantly share code, notes, and snippets.

@danielealbano
Last active December 23, 2019 00:19
Show Gist options
  • Save danielealbano/b631e6d80a10d248ebcc527d32090c34 to your computer and use it in GitHub Desktop.
Save danielealbano/b631e6d80a10d248ebcc527d32090c34 to your computer and use it in GitHub Desktop.
Test - OOP vs DOD data processing - C
// compile with
// gcc -O3 -m64 -mavx2 -funroll-loops -march=native -o test-oop-vs-dod-simple-data-processing test-oop-vs-dod-simple-data-processing.c
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <string.h>
// The compiler will not strip out anything related to this variable because it will assume it can be used by other
// code that is outside the scope of the code using it
volatile uint32_t foo = 0;
// Data structures used to build and print out the stats, they use hard-defined char buffers, the text is all
// defined in the code and will never get past the boundaries
struct test_report
{
uint32_t game_objects_count;
uint64_t cpu_cycles;
uint64_t allocated_memory;
};
typedef struct test_report test_report_t;
typedef int (*test_fn_t)(uint64_t game_object_count, test_report_t* report);
struct test
{
char name[100];
test_fn_t fn;
test_report_t* reports;
};
typedef struct test test_t;
// Support functions to get the cpu cycles
inline uint64_t cpu_cycles_start()
{
uint64_t cycles;
asm volatile(
"lfence\n\t"
"rdtsc\n\t"
"shl $32, %%rdx\n\t"
"or %%rdx, %0\n\t"
"lfence"
: "=a"(cycles)
:
// "memory" avoids reordering. rdx = TSC >> 32.
// "cc" = flags modified by SHL.
: "rdx", "memory", "cc");
return cycles;
}
inline uint64_t cpu_cycles_end()
{
uint64_t cycles;
asm volatile(
"rdtscp\n\t"
"shl $32, %%rdx\n\t"
"or %%rdx, %0\n\t"
"lfence"
: "=a"(cycles)
:
// "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32.
// "cc" = flags modified by SHL.
: "rcx", "rdx", "memory", "cc");
return cycles;
}
// Object Oriented Pattern
struct oop_game_object
{
uint8_t deleted;
uint8_t is_visible;
uint16_t object_type;
uint32_t mesh_id;
struct
{
double x;
double y;
double z;
} position;
struct
{
double x;
double y;
double z;
} orientation;
struct
{
bool has_animation;
uint32_t animation_id;
uint32_t frame;
} animation;
};
typedef struct oop_game_object oop_game_object_t;
int test_oop_loop(uint64_t game_object_count, test_report_t* report)
{
uint64_t allocated_memory = 0;
allocated_memory += sizeof(oop_game_object_t) * game_object_count;
// Allocate memory
oop_game_object_t* oop_game_objects = (oop_game_object_t*)malloc(sizeof(oop_game_object_t) * game_object_count);
if (oop_game_objects == NULL)
{
perror("Unable to allocate oop_game_objects");
return -1;
}
// Init the memory for the test
memset(oop_game_objects, 0, sizeof(oop_game_object_t) * game_object_count);
for(uint64_t index = 0; index < game_object_count / 2; index++)
{
oop_game_objects[index].is_visible = 1;
}
// Loop of the data - warmup
// This gives an HUGE boost to the OOP test, the data will be copied into the L2/L3 cache and because they are
// reused right after it's very likely that most of them will still be there, this will reduce the cache misses a lot.
for(uint64_t index = 0; index < game_object_count; index++)
{
if (oop_game_objects[index].deleted == 1)
{
continue;
}
if (oop_game_objects[index].is_visible == 1)
{
// This assigment is used to avoid that the compiler strips out the entire if because not being in use as
// part of the optimization process, process that we want to keep in place for the performance comparison
foo = index;
}
}
// Get the initial cpu cycles count
uint64_t cpu_cycles = cpu_cycles_start();
// Loop of the data - very simple algorithm
for(uint64_t index = 0; index < game_object_count; index++)
{
if (oop_game_objects[index].deleted == 1)
{
continue;
}
if (oop_game_objects[index].is_visible == 1)
{
// This assignment is used to avoid that the compiler strips out the entire if because not being in use as
// part of the optimization process, process that we want to keep in place for the performance comparison
foo = index;
}
}
// Calculate the CPU cycles
cpu_cycles = cpu_cycles_end() - cpu_cycles;
free(oop_game_objects);
// Update the report
report->cpu_cycles = cpu_cycles;
report->game_objects_count = game_object_count;
report->allocated_memory = allocated_memory;
return 0;
}
//
// Data Oriented Design
//
typedef uint8_t dod_game_object_deleted_t;
typedef uint8_t dod_game_object_is_visible_t;
struct dod_game_object
{
uint16_t object_type;
uint32_t mesh_id;
struct
{
double x;
double y;
double z;
} position;
struct
{
double x;
double y;
double z;
} orientation;
struct
{
bool has_animation;
uint32_t animation_id;
uint32_t frame;
} animation;
};
typedef struct dod_game_object dod_game_object_t;
int test_dod_loop(uint64_t game_object_count, test_report_t* report)
{
uint64_t allocated_memory = 0;
allocated_memory += sizeof(dod_game_object_t) * game_object_count;
allocated_memory += sizeof(dod_game_object_deleted_t) * game_object_count;
allocated_memory += sizeof(dod_game_object_is_visible_t) * game_object_count;
// Allocate the memory
dod_game_object_t* dod_game_objects =
(dod_game_object_t*)malloc(sizeof(dod_game_object_t) * game_object_count);
if (dod_game_objects == NULL)
{
perror("Unable to allocate dod_game_objects");
return -1;
}
dod_game_object_deleted_t* dod_game_object_deleted =
(dod_game_object_deleted_t*)malloc(sizeof(dod_game_object_deleted_t) * game_object_count);
if (dod_game_object_deleted == NULL)
{
perror("Unable to allocate dod_game_object_deleted");
return -2;
}
dod_game_object_is_visible_t* dod_game_object_is_visible =
(dod_game_object_is_visible_t*)malloc(sizeof(dod_game_object_is_visible_t) * game_object_count);
if (dod_game_object_is_visible == NULL)
{
perror("Unable to allocate dod_game_object_is_visible");
return -3;
}
// Init the memory for the test
memset(dod_game_objects, 0, sizeof(dod_game_object_t) * game_object_count);
memset(dod_game_objects, 0, sizeof(dod_game_object_deleted_t) * game_object_count);
memset(dod_game_objects, 0, sizeof(dod_game_object_is_visible_t) * game_object_count);
for(uint64_t index = 0; index < game_object_count / 2; index++)
{
dod_game_object_is_visible[index] = 1;
}
// Loop of the data - warmup
// This gives an HUGE boost to the OOP test, the data will be copied into the L2/L3 cache and because they are
// reused right after it's very likely that most of them will still be there, this will reduce the cache misses a lot.
for(uint64_t index = 0; index < game_object_count; index++)
{
if (dod_game_object_deleted[index] == 1)
{
continue;
}
if (dod_game_object_is_visible[index] == 1)
{
// This assigment is used to avoid that the compiler strips out the entire if because not being in use as
// part of the optimization process, process that we want to keep in place for the performance comparison
foo = index;
}
}
// Get the initial cpu cycles count
uint64_t cpu_cycles = cpu_cycles_start();
// Loop of the data - very simple algorithm
for(uint64_t index = 0; index < game_object_count; index++)
{
if (dod_game_object_deleted[index] == 1)
{
continue;
}
if (dod_game_object_is_visible[index] == 1)
{
// This assignment is used to avoid that the compiler strips out the entire if because not being in use as
// part of the optimization process, process that we want to keep in place for the performance comparison
foo = index;
}
}
// Calculate the CPU cycles
cpu_cycles = cpu_cycles_end() - cpu_cycles;
free(dod_game_objects);
// Update the report
report->cpu_cycles = cpu_cycles;
report->game_objects_count = game_object_count;
report->allocated_memory = allocated_memory;
return 0;
}
int main(int argc, char** argv)
{
test_t* test;
uint8_t runs = 10;
uint32_t game_objects_counts[] = { 1000, 10000, 100000, 1000000, 10000000 };
uint32_t tests_count = sizeof(game_objects_counts) / sizeof(uint32_t);
test_t tests[] = {
{
.name = "Object Oriented Pattern",
.fn = &test_oop_loop,
.reports = (test_report_t*)malloc(sizeof(test_report_t) * tests_count)
},
{
.name = "Data Oriented Design",
.fn = &test_dod_loop,
.reports = (test_report_t*)malloc(sizeof(test_report_t) * tests_count)
},
{
0,0,0
}
};
test = tests;
do
{
printf("Testing %s\n", test->name);
for(uint32_t loop_index = 0; loop_index < tests_count; loop_index++)
{
uint32_t game_objects_count = game_objects_counts[loop_index];
printf(" - loop %d, game_objects_count = %d (%d runs)\n", loop_index, game_objects_count, runs);
uint64_t cpu_cycles = 0;
for(uint8_t run_index = 0; run_index < runs; run_index++)
{
int res = (*test->fn)(game_objects_count, &test->reports[loop_index]);
if (res != 0)
{
fprintf(stderr, "Error during the execution, unable to continue\n");
return -1;
}
cpu_cycles += test->reports[loop_index].cpu_cycles;
}
test->reports[loop_index].cpu_cycles = cpu_cycles / runs;
}
printf("\n");
}
while(*(++test)->name != 0);
// Print the report
uint32_t separator_length = 28 + (18 * tests_count + 1);
printf("REPORT - CPU CYCLES (LOOP TOTAL / LOOP ITERATION)\n");
for(uint32_t i = 0; i < separator_length; i++) printf("-");
printf("\n");
// Header
printf("| %25s ", "ARRAY SIZE");
for(uint32_t loop_index = 0; loop_index < tests_count; loop_index++)
{
printf("| %15d ", game_objects_counts[loop_index]);
}
printf("|\n");
for(uint32_t i = 0; i < separator_length; i++) printf("-");
printf("\n");
test = tests;
do
{
printf("| %25s ", test->name);
for(uint32_t loop_index = 0; loop_index < tests_count; loop_index++)
{
printf("| %9lu ", test->reports[loop_index].cpu_cycles);
printf("| %3lu ", test->reports[loop_index].cpu_cycles / test->reports[loop_index].game_objects_count);
}
printf("|\n");
}
while(*(++test)->name != 0);
for(uint32_t i = 0; i < separator_length; i++) printf("-");
printf("\n");
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment