Last active
February 5, 2021 08:18
-
-
Save amadio/69c8524af31e50ff2e717ba3137bf005 to your computer and use it in GitHub Desktop.
Example of how to handle data on CPU/GPU using SoA layout to obtain good performance
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cstdio> | |
#include <cstddef> | |
#include <cstdint> | |
#include <cstring> | |
#include <typeinfo> | |
#include <utility> | |
#ifndef __CUDACC__ | |
# define __host__ | |
# define __device__ | |
# define __global__ | |
#endif | |
struct track { | |
/* event structure */ | |
int32_t id; | |
int32_t parent; | |
/* geometry data */ | |
float x; | |
float y; | |
float z; | |
int32_t geometry_id; | |
/* physics data */ | |
float vx; | |
float vy; | |
float vz; | |
float E; | |
int32_t material_id; | |
/* time */ | |
float global_time; | |
float proper_time; | |
/* generic 12 bytes cache to be shared between systems */ | |
char cache[12]; | |
/* rng state (64 bytes) */ | |
char state[64]; | |
static constexpr float mass = 0.511; | |
}; | |
// With sizeof(track) == 128, we have: | |
// 4KB pages with 32 tracks per page | |
// 2MB blocks with 512 pages of 4KB (16384 tracks per block) | |
// 1GB slabs with 512 blocks of 2MB (8388608 tracks per slab) | |
struct slab { | |
static constexpr int tracks_per_page = 32; | |
static constexpr int pages_per_block = 512; | |
static constexpr int blocks_per_slab = 512; | |
struct track tracks[blocks_per_slab][pages_per_block][tracks_per_page]; | |
}; | |
/* Below is a simple example function we'd like to reuse across CPU/GPU */ | |
__host__ __device__ | |
float kinetic_energy(float mass, float vx, float vy, float vz) | |
{ | |
return 0.5 * mass * (vx*vx + vy*vy + vz*vz); | |
} | |
/* This is how to process all tracks in the native AoS layout on CPU */ | |
void update_kinetic_energy_cpu_aos(struct slab* s) | |
{ | |
#pragma omp parallel for | |
for (int i = 0; i < slab::blocks_per_slab; ++i) | |
for (int j = 0; j < slab::pages_per_block; ++j) | |
for (track& t : s->tracks[i][j]) | |
t.E = kinetic_energy(track::mass, t.vx, t.vy, t.vz); | |
} | |
/* This is how to process all tracks in the native AoS layout on GPU */ | |
#ifdef __CUDACC__ | |
__global__ void update_kintetic_energy_gpu_aos(struct slab* s) | |
{ | |
track& t = s->tracks[blockIdx.x][threadIdx.y][threadIdx.x]; | |
t.E = kinetic_energy(track::mass, t.vx, t.vy, t.vz); | |
} | |
// How to call the kernel above: | |
// dim3 blocks(slab::blocks_per_slab); | |
// dim3 threads(slab::pages_per_block, slab::tracks_per_page); | |
// update_kinetic_energy_gpu_aos<<<blocks, threads>>>(s); | |
#endif | |
/* These are auxiliary macros to help reinterpret a page of tracks as if it were in SoA format */ | |
#define get_component_type(var, member) \ | |
decltype(std::remove_pointer<decltype(var)>::type::member) | |
#define get_component_offset(var, member, n) \ | |
((char*)var + n * offsetof(std::remove_pointer<decltype(var)>::type, member)) | |
#define get_component(var, member, n) \ | |
(get_component_type(var, member)*) get_component_offset(var, member, n) | |
/* This is how to process all tracks in SoA layout on CPU */ | |
void update_kinetic_energy_cpu_soa(struct slab* s) | |
{ | |
#pragma omp parallel for | |
for (int i = 0; i < slab::blocks_per_slab; ++i) { | |
for (int j = 0; j < slab::pages_per_block; ++j) { | |
track* page = s->tracks[i][j]; | |
auto vx = get_component(page, vx, slab::tracks_per_page); | |
auto vy = get_component(page, vy, slab::tracks_per_page); | |
auto vz = get_component(page, vz, slab::tracks_per_page); | |
auto E = get_component(page, E, slab::tracks_per_page); | |
for (int i = 0; i < slab::tracks_per_page; ++i) | |
E[i] = kinetic_energy(track::mass, vx[i], vy[i], vz[i]); | |
} | |
} | |
} | |
/* This is how to process all tracks in SoA layout on GPU */ | |
#ifdef __CUDACC__ | |
__global__ void update_kinetic_energy_gpu_soa(struct slab* s) | |
{ | |
track* page = s->tracks[blockIdx.x][threadIdx.y]; | |
auto vx = get_component(page, vx, blockDim.x); | |
auto vy = get_component(page, vy, blockDim.x); | |
auto vz = get_component(page, vz, blockDim.x); | |
auto E = get_component(page, E, blockDim.x); | |
auto i = threadIdx.x; | |
E[i] = kinetic_energy(track::mass, vx[i], vy[i], vz[i]); | |
} | |
// How to call the kernel above: | |
// dim3 blocks(slab::blocks_per_slab); | |
// dim3 threads(slab::pages_per_block, slab::tracks_per_page); | |
// compute_energy_gpu_soa<<<blocks, threads>>>(s); | |
#endif | |
int main(int argc, char **argv) | |
{ | |
slab *s = new slab(); | |
if (argc < 2) { | |
printf("Usage: %s [aos|soa]\n", argv[0]); | |
return 0; | |
} | |
while (argc > 1) { | |
if (strcmp(argv[--argc], "aos") == 0) | |
update_kinetic_energy_cpu_aos(s); | |
else if (strcmp(argv[--argc], "soa") == 0) | |
update_kinetic_energy_cpu_soa(s); | |
} | |
delete s; | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is a measurement of performance on my machine (AMD Ryzen 3950X):
So, about 4x faster for processing data with SoA (a lot less L1 dcache loads and less instructions due to vectorization).