Skip to content

Instantly share code, notes, and snippets.

@amadio
Last active February 5, 2021 08:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save amadio/69c8524af31e50ff2e717ba3137bf005 to your computer and use it in GitHub Desktop.
Save amadio/69c8524af31e50ff2e717ba3137bf005 to your computer and use it in GitHub Desktop.
Example of how to handle data on CPU/GPU using SoA layout to obtain good performance
#include <cstdio>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <typeinfo>
#include <utility>
#ifndef __CUDACC__
# define __host__
# define __device__
# define __global__
#endif
/* One particle track, laid out so that tracks tile evenly into pages,
 * blocks and slabs (the comment below states sizeof(track) == 128). */
struct track {
/* event structure */
int32_t id;          // track identifier; presumably unique within an event -- TODO confirm
int32_t parent;      // presumably the id of the parent track -- TODO confirm
/* geometry data */
float x;             // position
float y;
float z;
int32_t geometry_id; // id of the geometry element the track is in
/* physics data */
float vx;            // velocity components, used by kinetic_energy() below
float vy;
float vz;
float E;             // energy, written by the update_kinetic_energy_* functions
int32_t material_id; // id of the current material
/* time */
float global_time;
float proper_time;
/* generic 12 bytes cache to be shared between systems */
char cache[12];
/* rng state (64 bytes) */
char state[64];
/* 0.511 is presumably the electron rest mass in MeV/c^2; static, so it is
 * shared by all tracks and accessed as track::mass. */
static constexpr float mass = 0.511;
};
// With sizeof(track) == 128, we have:
// 4KB pages with 32 tracks per page
// 2MB blocks with 512 pages of 4KB (16384 tracks per block)
// 1GB slabs with 512 blocks of 2MB (8388608 tracks per slab)
struct slab {
static constexpr int tracks_per_page = 32;
static constexpr int pages_per_block = 512;
static constexpr int blocks_per_slab = 512;
struct track tracks[blocks_per_slab][pages_per_block][tracks_per_page];
};
/* Below is a simple example function we'd like to reuse across CPU/GPU */
#ifndef __host__ /* fallback so this snippet also compiles standalone */
# define __host__
# define __device__
#endif
/* Non-relativistic kinetic energy E = 1/2 m v^2 for a particle of mass
 * `mass` with velocity (vx, vy, vz). The 0.5f literal keeps the whole
 * computation in single precision; a plain 0.5 would promote the
 * expression to double and then narrow the result back to float. */
__host__ __device__
float kinetic_energy(float mass, float vx, float vy, float vz)
{
	return 0.5f * mass * (vx*vx + vy*vy + vz*vz);
}
/* This is how to process all tracks in the native AoS layout on CPU */
void update_kinetic_energy_cpu_aos(struct slab* s)
{
#pragma omp parallel for
for (int i = 0; i < slab::blocks_per_slab; ++i)
for (int j = 0; j < slab::pages_per_block; ++j)
for (track& t : s->tracks[i][j])
t.E = kinetic_energy(track::mass, t.vx, t.vy, t.vz);
}
/* This is how to process all tracks in the native AoS layout on GPU */
#ifdef __CUDACC__
/* One GPU thread per track: blockIdx.x selects the 2MB block, threadIdx.y
 * the page within the block, threadIdx.x the track within the page.
 * Renamed from update_kintetic_energy_gpu_aos (typo) so the definition
 * matches the call example below. */
__global__ void update_kinetic_energy_gpu_aos(struct slab* s)
{
track& t = s->tracks[blockIdx.x][threadIdx.y][threadIdx.x];
t.E = kinetic_energy(track::mass, t.vx, t.vy, t.vz);
}
// How to call the kernel above:
// dim3 blocks(slab::blocks_per_slab);
// dim3 threads(slab::pages_per_block, slab::tracks_per_page);
// update_kinetic_energy_gpu_aos<<<blocks, threads>>>(s);
// NOTE(review): 512 x 32 = 16384 threads per block exceeds the usual CUDA
// limit of 1024 threads per block -- confirm the intended launch geometry.
#endif
/* These are auxiliary macros to help reinterpret a page of tracks as if it were in SoA format */
/* get_component_type(var, member): the declared type of `member` inside the
 * struct that pointer `var` points to. */
#define get_component_type(var, member) \
decltype(std::remove_pointer<decltype(var)>::type::member)
/* get_component_offset(var, member, n): byte address of the start of the
 * `member` array in an SoA view holding n elements per component: each
 * component array begins at base + n * offsetof(struct, member). */
#define get_component_offset(var, member, n) \
((char*)var + n * offsetof(std::remove_pointer<decltype(var)>::type, member))
/* get_component(var, member, n): typed pointer to the SoA component array.
 * NOTE(review): this addresses the page's bytes as if they were stored in
 * SoA order; it is a different layout of the same memory than the AoS
 * indexing used above -- confirm the data is transposed (or, as in this
 * benchmark, that only throughput is being measured). */
#define get_component(var, member, n) \
(get_component_type(var, member)*) get_component_offset(var, member, n)
/* This is how to process all tracks in SoA layout on CPU.
 * Each 4KB page is reinterpreted as per-component arrays of
 * tracks_per_page elements (see the get_component macros above), which
 * lets the compiler vectorize the innermost loop. */
void update_kinetic_energy_cpu_soa(struct slab* s)
{
#pragma omp parallel for
for (int i = 0; i < slab::blocks_per_slab; ++i) {
for (int j = 0; j < slab::pages_per_block; ++j) {
track* page = s->tracks[i][j];
auto vx = get_component(page, vx, slab::tracks_per_page);
auto vy = get_component(page, vy, slab::tracks_per_page);
auto vz = get_component(page, vz, slab::tracks_per_page);
auto E = get_component(page, E, slab::tracks_per_page);
/* Inner index renamed from `i` to `k`: the original shadowed the
 * outer block index `i`, a bugprone-shadow hazard. */
for (int k = 0; k < slab::tracks_per_page; ++k)
E[k] = kinetic_energy(track::mass, vx[k], vy[k], vz[k]);
}
}
}
/* This is how to process all tracks in SoA layout on GPU */
#ifdef __CUDACC__
/* One thread per track: blockIdx.x selects the block, threadIdx.y the page,
 * and threadIdx.x indexes into the page's SoA component arrays, so adjacent
 * threads read adjacent floats (coalesced accesses). */
__global__ void update_kinetic_energy_gpu_soa(struct slab* s)
{
track* page = s->tracks[blockIdx.x][threadIdx.y];
auto vx = get_component(page, vx, blockDim.x);
auto vy = get_component(page, vy, blockDim.x);
auto vz = get_component(page, vz, blockDim.x);
auto E = get_component(page, E, blockDim.x);
auto i = threadIdx.x;
E[i] = kinetic_energy(track::mass, vx[i], vy[i], vz[i]);
}
// How to call the kernel above:
// dim3 blocks(slab::blocks_per_slab);
// dim3 threads(slab::pages_per_block, slab::tracks_per_page);
// update_kinetic_energy_gpu_soa<<<blocks, threads>>>(s);
// NOTE(review): the call example previously named a non-existent
// compute_energy_gpu_soa; also, 512 x 32 = 16384 threads per block exceeds
// the usual CUDA limit of 1024 threads per block -- confirm the launch.
#endif
/* Entry point: allocates one slab (zero-initialized by the value-init
 * `new slab()`), then runs the CPU AoS and/or SoA update once for each
 * matching command-line argument, scanned right to left. */
int main(int argc, char **argv)
{
	slab *s = new slab();
	if (argc < 2) {
		printf("Usage: %s [aos|soa]\n", argv[0]);
		return 0;
	}
	while (argc > 1) {
		/* Decrement argc exactly once per argument. The original code
		 * evaluated argv[--argc] again in the else-if, which skipped an
		 * argument on mismatch and could compare argv[0] instead, so
		 * e.g. "./prog soa" never ran the SoA path. */
		const char *arg = argv[--argc];
		if (strcmp(arg, "aos") == 0)
			update_kinetic_energy_cpu_aos(s);
		else if (strcmp(arg, "soa") == 0)
			update_kinetic_energy_cpu_soa(s);
	}
	delete s;
	return 0;
}
@hageboeck
Copy link

Does this work?

-	t.E = kinetic_energy(track::mass, t.vx, t.vy, t.vz);
+	t.E = kinetic_energy(t.mass, t.vx, t.vy, t.vz);

I assume it does. The reason I would do that is that for a different entity, this could actually be variable, and the calling code doesn't have to know the difference.

@amadio
Copy link
Author

amadio commented Feb 4, 2021

Does this work?

-	t.E = kinetic_energy(track::mass, t.vx, t.vy, t.vz);
+	t.E = kinetic_energy(t.mass, t.vx, t.vy, t.vz);

I assume it does. The reason I would do that is that for a different entity, this could actually be variable, and the calling code doesn't have to know the difference.

No, it doesn't because mass is a static member of track.

@amadio
Copy link
Author

amadio commented Feb 4, 2021

This is a measurement of performance on my machine (AMD Ryzen 3950X):

 Performance counter stats for 'AoS' (20 runs):

            483.46 msec task-clock                #    0.999 CPUs utilized            ( +-  0.08% )
                 1      context-switches          #    0.001 K/sec                    ( +- 20.99% )
                 0      cpu-migrations            #    0.000 K/sec                  
              1138      page-faults               #    0.002 M/sec                    ( +-  0.02% )
        1687395641      cycles                    #    3.490 GHz                      ( +-  0.08% )  (38.19%)
         106037669      stalled-cycles-frontend   #    6.28% frontend cycles idle     ( +-  1.62% )  (40.26%)
        1283753418      stalled-cycles-backend    #   76.08% backend cycles idle      ( +-  0.30% )  (42.32%)
        1996927212      instructions              #    1.18  insn per cycle         
                                                  #    0.64  stalled cycles per insn  ( +-  0.25% )  (44.39%)
          19497066      branches                  #   40.328 M/sec                    ( +-  0.71% )  (46.46%)
             40196      branch-misses             #    0.21% of all branches          ( +-  0.53% )  (48.53%)
         622021303      L1-dcache-loads           # 1286.609 M/sec                    ( +-  0.09% )  (49.17%)
         123124358      L1-dcache-load-misses     #   19.79% of all L1-dcache accesses  ( +-  0.10% )  (47.33%)
          42493039      L1-icache-loads           #   87.894 M/sec                    ( +-  0.24% )  (45.27%)
            149328      L1-icache-load-misses     #    0.35% of all L1-icache accesses  ( +-  0.47% )  (43.20%)
             31003      dTLB-loads                #    0.064 M/sec                    ( +-  1.87% )  (41.13%)
              8598      dTLB-load-misses          #   27.73% of all dTLB cache accesses  ( +-  3.06% )  (39.06%)
                26      iTLB-loads                #    0.055 K/sec                    ( +- 43.50% )  (37.46%)
                19      iTLB-load-misses          #   72.30% of all iTLB cache accesses  ( +- 54.59% )  (37.23%)

          0.484160 +- 0.000367 seconds time elapsed  ( +-  0.08% )

 Performance counter stats for 'SoA' (20 runs):

            118.78 msec task-clock                #    0.994 CPUs utilized            ( +-  0.10% )
                 1      context-switches          #    0.010 K/sec                    ( +-  9.75% )
                 0      cpu-migrations            #    0.000 K/sec                  
              1138      page-faults               #    0.010 M/sec                    ( +-  0.02% )
         414521761      cycles                    #    3.490 GHz                      ( +-  0.10% )  (32.65%)
         131647110      stalled-cycles-frontend   #   31.76% frontend cycles idle     ( +-  2.02% )  (32.65%)
         182606710      stalled-cycles-backend    #   44.05% backend cycles idle      ( +-  3.40% )  (32.66%)
         122945544      instructions              #    0.30  insn per cycle         
                                                  #    1.49  stalled cycles per insn  ( +-  1.73% )  (32.73%)
          17639259      branches                  #  148.508 M/sec                    ( +-  0.36% )  (37.77%)
             38941      branch-misses             #    0.22% of all branches          ( +-  1.56% )  (46.19%)
          81264204      L1-dcache-loads           #  684.179 M/sec                    ( +-  0.25% )  (50.51%)
          42555034      L1-dcache-load-misses     #   52.37% of all L1-dcache accesses  ( +-  0.09% )  (50.51%)
           8799361      L1-icache-loads           #   74.083 M/sec                    ( +-  0.36% )  (50.51%)
             38984      L1-icache-load-misses     #    0.44% of all L1-icache accesses  ( +-  2.12% )  (50.51%)
             13098      dTLB-loads                #    0.110 M/sec                    ( +-  3.81% )  (50.51%)
              6723      dTLB-load-misses          #   51.33% of all dTLB cache accesses  ( +-  3.99% )  (50.44%)
                 5      iTLB-loads                #    0.040 K/sec                    ( +- 19.07% )  (45.40%)
               112      iTLB-load-misses          # 2363.16% of all iTLB cache accesses  ( +-  3.90% )  (36.98%)

          0.119446 +- 0.000115 seconds time elapsed  ( +-  0.10% )

So, processing the data is about 4x faster with SoA (far fewer L1 dcache loads, and fewer instructions thanks to vectorization).

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment