Skip to content

Instantly share code, notes, and snippets.

@jungin500
Created March 19, 2022 05:54
Show Gist options
  • Save jungin500/e368e1fa6e041124c8d102378ef9c374 to your computer and use it in GitHub Desktop.
Save jungin500/e368e1fa6e041124c8d102378ef9c374 to your computer and use it in GitHub Desktop.
Raspberry Pi 3B 32bit (armv7l) CXX_FLAGS
# OS: Raspbian Buster(10) armv7l on Raspberry Pi 3B (1GB)
# Standard command (-mfpu=neon-fp-armv8 or -mfpu=neon-vfpv4 is necessary for the build!)
g++ -o benchmark main.cpp fun_neon.cpp fun.cpp -mfpu=neon-vfpv4
# Scalar: 360.64 msec, 0.00 Mcycles
# NEON: 171.59 msec, 0.00 Mcycles
# Standard command with "-mfloat-abi=hard"
g++ -o benchmark main.cpp fun_neon.cpp fun.cpp -mfpu=neon-vfpv4 -mfloat-abi=hard
# Scalar: 362.39 msec, 0.00 Mcycles
# NEON: 172.54 msec, 0.00 Mcycles
# Extended with mcpu and mtune as shown here:
# https://gist.github.com/fm4dd/c663217935dc17f0fc73c9c81b0aa845
g++ -o benchmark main.cpp fun_neon.cpp fun.cpp \
-mtune=cortex-a53 -mcpu=cortex-a53 -mfloat-abi=hard -mfpu=neon-fp-armv8 -mneon-for-64bits
# Scalar: 363.59 msec, 0.00 Mcycles
# NEON: 174.56 msec, 0.00 Mcycles
# More extended with specific arch and operations as shown here:
# https://github.com/superuser789/MediaPipe-on-RaspberryPi
g++ -o benchmark main.cpp fun_neon.cpp fun.cpp \
-march=armv8-a+crc -mfpu=neon-vfpv4 -mtune=cortex-a53 -ftree-vectorize -mfloat-abi=hard
# Scalar: 360.06 msec, 0.00 Mcycles
# NEON: 172.10 msec, 0.00 Mcycles
# (IMPORTANT) with Optimization Flag:
g++ -o benchmark main.cpp fun_neon.cpp fun.cpp \
-march=armv8-a+crc -mfpu=neon-vfpv4 -mtune=cortex-a53 -ftree-vectorize -mfloat-abi=hard -O3
# Scalar: 142.38 msec, 0.00 Mcycles
# NEON: 51.86 msec, 0.00 Mcycles
/*
 * Scalar reference kernel.
 *
 * For every index k in [0, COUNT) computes x = A[k] * B[k] and stores
 * (x + x*x) * x into C[k]. Nothing is written when COUNT <= 0.
 */
void fun(int COUNT, float* A, float* B, float* C)
{
    int k = 0;
    while (k < COUNT)
    {
        const float prod = A[k] * B[k];
        const float inner = prod + prod * prod;
        C[k] = inner * prod;
        ++k;
    }
}
#include <arm_neon.h>
/*
 * NEON kernel: for every index i in [0, COUNT) computes x = A[i] * B[i]
 * and stores (x + x*x) * x into C[i].
 *
 * Four floats are processed per iteration with explicit vld1q_f32 /
 * vst1q_f32 loads and stores, so the buffers are not reinterpreted through
 * float32x4_t pointers and need no 16-byte alignment. Any 1-3 elements left
 * over when COUNT is not a multiple of 4 are finished by a scalar loop
 * instead of being skipped. Nothing is written when COUNT <= 0.
 */
void fun_neon(int COUNT, float* A, float* B, float* C)
{
    int i = 0;

    /* Vector body: runs while at least four elements remain. */
    for (; COUNT - i >= 4; i += 4)
    {
        float32x4_t x = vmulq_f32(vld1q_f32(A + i), vld1q_f32(B + i));
        /* vmlaq_f32(x, x, x) yields x + x*x per lane. */
        vst1q_f32(C + i, vmulq_f32(x, vmlaq_f32(x, x, x)));
    }

    /* Scalar tail for the remaining COUNT % 4 elements. */
    for (; i < COUNT; i++)
    {
        float x = A[i] * B[i];
        C[i] = (x + x * x) * x;
    }
}
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h> // read, close
#include <sys/syscall.h> // syscall, __NR_perf_event_open
#ifndef __ANDROID__
#include <linux/perf_event.h> // perf_event_attr, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES
#else
// Android NDK doesn't have perf_event.h header :(
// copy&paste from http://lxr.free-electrons.com/source/include/uapi/linux/perf_event.h
// Fallback copy of the perf event-type selectors, compiled only in the
// __ANDROID__ branch where <linux/perf_event.h> is not included.
// These are the values assigned to perf_event_attr.type; the visible code
// in this file only uses PERF_TYPE_HARDWARE.
enum perf_type_id {
PERF_TYPE_HARDWARE = 0,
PERF_TYPE_SOFTWARE = 1,
PERF_TYPE_TRACEPOINT = 2,
PERF_TYPE_HW_CACHE = 3,
PERF_TYPE_RAW = 4,
PERF_TYPE_BREAKPOINT = 5,
PERF_TYPE_MAX, // one past the last listed type (6)
};
// Fallback copy of the hardware event ids, compiled only in the
// __ANDROID__ branch. main() stores one of these in perf_event_attr.config
// together with type = PERF_TYPE_HARDWARE; the visible code only uses
// PERF_COUNT_HW_CPU_CYCLES.
enum perf_hw_id
{
PERF_COUNT_HW_CPU_CYCLES = 0,
PERF_COUNT_HW_INSTRUCTIONS = 1,
PERF_COUNT_HW_CACHE_REFERENCES = 2,
PERF_COUNT_HW_CACHE_MISSES = 3,
PERF_COUNT_HW_BRANCH_INSTRUCTIONS = 4,
PERF_COUNT_HW_BRANCH_MISSES = 5,
PERF_COUNT_HW_BUS_CYCLES = 6,
PERF_COUNT_HW_STALLED_CYCLES_FRONTEND = 7,
PERF_COUNT_HW_STALLED_CYCLES_BACKEND = 8,
PERF_COUNT_HW_REF_CPU_CYCLES = 9,
PERF_COUNT_HW_MAX, // one past the last listed id (10)
};
// Kernel-style fixed-width aliases used by the perf_event_attr fallback
// that follows; mapped onto the <stdint.h> types of the same width.
// NOTE(review): double-underscore names are reserved identifiers; they are
// kept because the copied struct definition spells its fields this way.
typedef int32_t __s32;
typedef uint32_t __u32;
typedef uint64_t __u64;
// Fallback definition of the attribute block handed to the perf_event_open
// syscall, compiled only in the __ANDROID__ branch. It was copied from the
// kernel's uapi header (see the URL in the comment above the enums), so
// field order and widths must stay exactly as they are.
// main() zero-initializes the whole struct and then sets only `type` and
// `config`.
struct perf_event_attr
{
__u32 type;   // event class; main() uses PERF_TYPE_HARDWARE
__u32 size;   // left at 0 by the zero-initialization in main()
__u64 config; // event id within `type`; main() uses PERF_COUNT_HW_CPU_CYCLES
// Two names for the same 64-bit slot.
union {
__u64 sample_period;
__u64 sample_freq;
};
__u64 sample_type;
__u64 read_format;
// Flag bit-fields packed into one __u64: 26 bits are named below and the
// remaining 38 are reserved (15 + 2 + 9 + 38 = 64).
__u64 disabled : 1,
inherit : 1,
pinned : 1,
exclusive : 1,
exclude_user : 1,
exclude_kernel : 1,
exclude_hv : 1,
exclude_idle : 1,
mmap : 1,
comm : 1,
freq : 1,
inherit_stat : 1,
enable_on_exec : 1,
task : 1,
watermark : 1,
precise_ip : 2,
mmap_data : 1,
sample_id_all : 1,
exclude_host : 1,
exclude_guest : 1,
exclude_callchain_kernel : 1,
exclude_callchain_user : 1,
mmap2 : 1,
comm_exec : 1,
use_clockid : 1,
__reserved_1 : 38;
// Two names for the same 32-bit slot.
union {
__u32 wakeup_events;
__u32 wakeup_watermark;
};
__u32 bp_type;
// bp_addr/config1 and bp_len/config2 each share one 64-bit slot.
union {
__u64 bp_addr;
__u64 config1;
};
union {
__u64 bp_len;
__u64 config2;
};
__u64 branch_sample_type;
__u64 sample_regs_user;
__u32 sample_stack_user;
__s32 clockid;
__u64 sample_regs_intr;
__u32 aux_watermark;
__u32 __reserved_2;
};
#endif
/*
 * Returns the current CLOCK_MONOTONIC reading as a single nanosecond count
 * (seconds * 1e9 + nanoseconds). The clock_gettime() status is not checked.
 */
static uint64_t get_ticks()
{
    struct timespec now;
    clock_gettime(CLOCK_MONOTONIC, &now);

    const uint64_t seconds_as_ns = now.tv_sec * 1000000000ULL;
    return seconds_as_ns + now.tv_nsec;
}
/* Descriptor that get_cycles() reads from; main() assigns it from the
 * perf_event_open syscall. */
static int fd;

/*
 * Reads one 64-bit counter value from `fd`.
 *
 * Returns the value read, or 0 when read() fails or delivers fewer than
 * sizeof(uint64_t) bytes (for example when `fd` is not a valid descriptor).
 */
static uint64_t get_cycles()
{
    uint64_t counter;
    const ssize_t wanted = (ssize_t)sizeof(counter);
    const ssize_t got = read(fd, &counter, sizeof(counter));

    return (got >= wanted) ? counter : 0;
}
void fun(int COUNT, float* A, float* B, float* C);
void fun_neon(int COUNT, float* A, float* B, float* C);
// Runs `kernel` once untimed (presumably as a warm-up pass), then once more
// bracketed by get_ticks()/get_cycles(), and prints
// "<label>: <elapsed> msec, <cycles> Mcycles" to stdout.
// The cycle read is nested inside the time read, matching the order the
// measurements have always been taken in.
static void run_benchmark(const char* label,
                          void (*kernel)(int, float*, float*, float*),
                          int count, float* in_a, float* in_b, float* out)
{
    kernel(count, in_a, in_b, out); // not measured

    uint64_t t1 = get_ticks();
    uint64_t c1 = get_cycles();
    kernel(count, in_a, in_b, out);
    uint64_t c2 = get_cycles();
    uint64_t t2 = get_ticks();

    printf("%s: %.2f msec, %.2f Mcycles\n", label,
           (float)(t2 - t1) / 1e6f, (float)(c2 - c1) / 1e6f);
}

// Benchmark driver: opens a CPU-cycle perf counter, fills two input arrays
// with pseudo-random values in [0, 1], then times the scalar and NEON
// kernels on them. Returns 0.
int main()
{
    struct perf_event_attr attr = {};
    attr.type = PERF_TYPE_HARDWARE;
    attr.size = sizeof(attr); // tell the kernel which struct revision we pass
    attr.config = PERF_COUNT_HW_CPU_CYCLES;
    // pid = 0 (this process), cpu = -1 (any CPU), no group, no flags.
    fd = (int)syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
    if (fd < 0)
    {
        // Keep going with timing only: get_cycles() returns 0 on a bad
        // descriptor, so say why the cycle column will read 0.00.
        perror("perf_event_open");
        fprintf(stderr, "cycle counter unavailable; Mcycles will be reported as 0.00\n");
    }

    const int COUNT = 8 * 1024*1024; // 8 Mi floats = 32 MiB per array
    // Static storage: three 32 MiB arrays are far too large for the stack.
    static float A[COUNT] __attribute__((aligned(16)));
    static float B[COUNT] __attribute__((aligned(16)));
    static float C[COUNT] __attribute__((aligned(16)));

    // rand() is left unseeded, so every run sees the same inputs.
    for (int i=0; i<COUNT; i++)
    {
        A[i] = (float)rand()/(float)RAND_MAX;
        B[i] = (float)rand()/(float)RAND_MAX;
    }

    run_benchmark("Scalar", fun, COUNT, A, B, C);
    run_benchmark("NEON", fun_neon, COUNT, A, B, C);

    if (fd >= 0)
    {
        close(fd);
    }
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment