Last active
February 5, 2021 08:18
-
-
Save amadio/69c8524af31e50ff2e717ba3137bf005 to your computer and use it in GitHub Desktop.
Example of how to handle data on CPU/GPU using SoA layout to obtain good performance
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cstdio> | |
#include <cstddef> | |
#include <cstdint> | |
#include <cstring> | |
#include <typeinfo> | |
#include <utility> | |
#ifndef __CUDACC__ | |
# define __host__ | |
# define __device__ | |
# define __global__ | |
#endif | |
/* One particle track (128 bytes). Fields are grouped by subsystem; the
 * page/block/slab arithmetic and the SoA reinterpretation macros below
 * depend on this struct being exactly 128 bytes. */
struct track {
    /* event structure */
    int32_t id;
    int32_t parent;
    /* geometry data */
    float x;
    float y;
    float z;
    int32_t geometry_id;
    /* physics data */
    float vx;
    float vy;
    float vz;
    float E;
    int32_t material_id;
    /* time */
    float global_time;
    float proper_time;
    /* generic 12 bytes cache to be shared between systems */
    char cache[12];
    /* rng state (64 bytes) */
    char state[64];
    /* electron rest mass; 'f' suffix keeps the literal single-precision
     * (same value as the old double literal, clearer intent) */
    static constexpr float mass = 0.511f;
};

/* Fail loudly if the 128-byte layout assumption is ever broken. */
static_assert(sizeof(track) == 128, "track must be exactly 128 bytes");
// With sizeof(track) == 128, we have:
// 4KB pages with 32 tracks per page
// 2MB blocks with 512 pages of 4KB (16384 tracks per block)
// 1GB slabs with 512 blocks of 2MB (8388608 tracks per slab)
/* A slab is one contiguous 1 GiB arena of tracks, laid out as a
 * block > page > track hierarchy. The GPU kernels below index it as
 * tracks[blockIdx.x][threadIdx.y][threadIdx.x], and the SoA macros
 * reinterpret one 4 KiB page (32 tracks) at a time. */
struct slab {
    static constexpr int tracks_per_page = 32;   /* 32 * 128 B  = 4 KiB page  */
    static constexpr int pages_per_block = 512;  /* 512 * 4 KiB = 2 MiB block */
    static constexpr int blocks_per_slab = 512;  /* 512 * 2 MiB = 1 GiB slab  */
    struct track tracks[blocks_per_slab][pages_per_block][tracks_per_page];
};
/* Below is a simple example function we'd like to reuse across CPU/GPU */
/* Non-relativistic kinetic energy T = (1/2) m |v|^2.
 * Uses 0.5f (not 0.5) so the whole expression stays in single precision;
 * a double literal would promote every operand to double and back on
 * each call of this hot, vectorizable function. */
__host__ __device__
float kinetic_energy(float mass, float vx, float vy, float vz)
{
    return 0.5f * mass * (vx*vx + vy*vy + vz*vz);
}
/* This is how to process all tracks in the native AoS layout on CPU */ | |
void update_kinetic_energy_cpu_aos(struct slab* s) | |
{ | |
#pragma omp parallel for | |
for (int i = 0; i < slab::blocks_per_slab; ++i) | |
for (int j = 0; j < slab::pages_per_block; ++j) | |
for (track& t : s->tracks[i][j]) | |
t.E = kinetic_energy(track::mass, t.vx, t.vy, t.vz); | |
} | |
/* This is how to process all tracks in the native AoS layout on GPU */
#ifdef __CUDACC__
/* One thread per track: blockIdx.x selects the 2 MiB block,
 * threadIdx.y the page, threadIdx.x the track within the page.
 * Fixed typo: was "update_kintetic_energy_gpu_aos", which did not match
 * the launch example below. */
__global__ void update_kinetic_energy_gpu_aos(struct slab* s)
{
    track& t = s->tracks[blockIdx.x][threadIdx.y][threadIdx.x];
    t.E = kinetic_energy(track::mass, t.vx, t.vy, t.vz);
}
// How to call the kernel above:
// dim3 blocks(slab::blocks_per_slab);
// dim3 threads(slab::pages_per_block, slab::tracks_per_page);
// update_kinetic_energy_gpu_aos<<<blocks, threads>>>(s);
// NOTE(review): 512 * 32 = 16384 threads per block exceeds the
// 1024-threads-per-block CUDA limit; a real launch would have to split
// pages across grid dimensions -- confirm before using this config.
#endif
/* These are auxiliary macros to help reinterpret a page of tracks as if it were in SoA format */
/* get_component_type(var, member): yields the declared type of `member`
 * in the struct that `var` points to (e.g. float for track::vx). */
#define get_component_type(var, member) \
decltype(std::remove_pointer<decltype(var)>::type::member)
/* get_component_offset(var, member, n): byte address where the `member`
 * column begins when `n` structs are stored column-major (SoA): each
 * column starts n * offsetof(member) bytes from the page base.
 * NOTE(review): dereferencing the resulting pointer treats the page
 * bytes as SoA regardless of how they were written, and casting
 * through char* like this is a strict-aliasing gray area -- confirm the
 * data producer really fills pages in SoA order.
 * NOTE(review): std::remove_pointer lives in <type_traits>, which this
 * file only gets transitively via <utility> -- consider including it
 * explicitly. */
#define get_component_offset(var, member, n) \
((char*)var + n * offsetof(std::remove_pointer<decltype(var)>::type, member))
/* get_component(var, member, n): typed pointer to the start of the
 * `member` column of an n-element SoA page. */
#define get_component(var, member, n) \
(get_component_type(var, member)*) get_component_offset(var, member, n)
/* This is how to process all tracks in SoA layout on CPU */ | |
void update_kinetic_energy_cpu_soa(struct slab* s) | |
{ | |
#pragma omp parallel for | |
for (int i = 0; i < slab::blocks_per_slab; ++i) { | |
for (int j = 0; j < slab::pages_per_block; ++j) { | |
track* page = s->tracks[i][j]; | |
auto vx = get_component(page, vx, slab::tracks_per_page); | |
auto vy = get_component(page, vy, slab::tracks_per_page); | |
auto vz = get_component(page, vz, slab::tracks_per_page); | |
auto E = get_component(page, E, slab::tracks_per_page); | |
for (int i = 0; i < slab::tracks_per_page; ++i) | |
E[i] = kinetic_energy(track::mass, vx[i], vy[i], vz[i]); | |
} | |
} | |
} | |
/* This is how to process all tracks in SoA layout on GPU */
#ifdef __CUDACC__
/* Same mapping as the AoS kernel (block / page / track), but each warp
 * reads contiguous per-component columns, giving coalesced accesses. */
__global__ void update_kinetic_energy_gpu_soa(struct slab* s)
{
    track* page = s->tracks[blockIdx.x][threadIdx.y];
    auto vx = get_component(page, vx, blockDim.x);
    auto vy = get_component(page, vy, blockDim.x);
    auto vz = get_component(page, vz, blockDim.x);
    auto E  = get_component(page, E,  blockDim.x);
    auto i = threadIdx.x;
    E[i] = kinetic_energy(track::mass, vx[i], vy[i], vz[i]);
}
// How to call the kernel above:
// dim3 blocks(slab::blocks_per_slab);
// dim3 threads(slab::pages_per_block, slab::tracks_per_page);
// update_kinetic_energy_gpu_soa<<<blocks, threads>>>(s);
// (fixed: the comment previously named a nonexistent
//  "compute_energy_gpu_soa")
// NOTE(review): 512 * 32 = 16384 threads per block exceeds the CUDA
// 1024-threads-per-block limit -- confirm the launch geometry.
#endif
int main(int argc, char **argv) | |
{ | |
slab *s = new slab(); | |
if (argc < 2) { | |
printf("Usage: %s [aos|soa]\n", argv[0]); | |
return 0; | |
} | |
while (argc > 1) { | |
if (strcmp(argv[--argc], "aos") == 0) | |
update_kinetic_energy_cpu_aos(s); | |
else if (strcmp(argv[--argc], "soa") == 0) | |
update_kinetic_energy_cpu_soa(s); | |
} | |
delete s; | |
return 0; | |
} |
Does this work?
- t.E = kinetic_energy(track::mass, t.vx, t.vy, t.vz);
+ t.E = kinetic_energy(t.mass, t.vx, t.vy, t.vz);

I assume it does. The reason I would do that is that, for a different entity, this could actually be a variable, and the calling code wouldn't have to know the difference.
No, it doesn't, because `mass` is a static member of `track`.
This is a measurement of performance on my machine (AMD Ryzen 3950X):
Performance counter stats for 'AoS' (20 runs):
483.46 msec task-clock # 0.999 CPUs utilized ( +- 0.08% )
1 context-switches # 0.001 K/sec ( +- 20.99% )
0 cpu-migrations # 0.000 K/sec
1138 page-faults # 0.002 M/sec ( +- 0.02% )
1687395641 cycles # 3.490 GHz ( +- 0.08% ) (38.19%)
106037669 stalled-cycles-frontend # 6.28% frontend cycles idle ( +- 1.62% ) (40.26%)
1283753418 stalled-cycles-backend # 76.08% backend cycles idle ( +- 0.30% ) (42.32%)
1996927212 instructions # 1.18 insn per cycle
# 0.64 stalled cycles per insn ( +- 0.25% ) (44.39%)
19497066 branches # 40.328 M/sec ( +- 0.71% ) (46.46%)
40196 branch-misses # 0.21% of all branches ( +- 0.53% ) (48.53%)
622021303 L1-dcache-loads # 1286.609 M/sec ( +- 0.09% ) (49.17%)
123124358 L1-dcache-load-misses # 19.79% of all L1-dcache accesses ( +- 0.10% ) (47.33%)
42493039 L1-icache-loads # 87.894 M/sec ( +- 0.24% ) (45.27%)
149328 L1-icache-load-misses # 0.35% of all L1-icache accesses ( +- 0.47% ) (43.20%)
31003 dTLB-loads # 0.064 M/sec ( +- 1.87% ) (41.13%)
8598 dTLB-load-misses # 27.73% of all dTLB cache accesses ( +- 3.06% ) (39.06%)
26 iTLB-loads # 0.055 K/sec ( +- 43.50% ) (37.46%)
19 iTLB-load-misses # 72.30% of all iTLB cache accesses ( +- 54.59% ) (37.23%)
0.484160 +- 0.000367 seconds time elapsed ( +- 0.08% )
Performance counter stats for 'SoA' (20 runs):
118.78 msec task-clock # 0.994 CPUs utilized ( +- 0.10% )
1 context-switches # 0.010 K/sec ( +- 9.75% )
0 cpu-migrations # 0.000 K/sec
1138 page-faults # 0.010 M/sec ( +- 0.02% )
414521761 cycles # 3.490 GHz ( +- 0.10% ) (32.65%)
131647110 stalled-cycles-frontend # 31.76% frontend cycles idle ( +- 2.02% ) (32.65%)
182606710 stalled-cycles-backend # 44.05% backend cycles idle ( +- 3.40% ) (32.66%)
122945544 instructions # 0.30 insn per cycle
# 1.49 stalled cycles per insn ( +- 1.73% ) (32.73%)
17639259 branches # 148.508 M/sec ( +- 0.36% ) (37.77%)
38941 branch-misses # 0.22% of all branches ( +- 1.56% ) (46.19%)
81264204 L1-dcache-loads # 684.179 M/sec ( +- 0.25% ) (50.51%)
42555034 L1-dcache-load-misses # 52.37% of all L1-dcache accesses ( +- 0.09% ) (50.51%)
8799361 L1-icache-loads # 74.083 M/sec ( +- 0.36% ) (50.51%)
38984 L1-icache-load-misses # 0.44% of all L1-icache accesses ( +- 2.12% ) (50.51%)
13098 dTLB-loads # 0.110 M/sec ( +- 3.81% ) (50.51%)
6723 dTLB-load-misses # 51.33% of all dTLB cache accesses ( +- 3.99% ) (50.44%)
5 iTLB-loads # 0.040 K/sec ( +- 19.07% ) (45.40%)
112 iTLB-load-misses # 2363.16% of all iTLB cache accesses ( +- 3.90% ) (36.98%)
0.119446 +- 0.000115 seconds time elapsed ( +- 0.10% )
So, about 4x faster for processing data with SoA (far fewer L1 d-cache loads, and fewer instructions thanks to vectorization).
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Does this work?
I assume it does. The reason I would do that is that for a different entity, this could actually be variable, and the calling code doesn't have to know the difference.