Created
March 19, 2022 05:44
-
-
Save jungin500/8b827d6a3874638001c3f733bb4c8051 to your computer and use it in GitHub Desktop.
Raspberry Pi 4B 64bit (aarch64) CXX_FLAGS
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Compiler-flag comparison for the scalar vs. NEON benchmark
# (sources: main.cpp, fun.cpp, fun_neon.cpp).
# OS: Ubuntu Server 20.04 aarch64 on Raspberry Pi 4B (4GB)
#
# The "Scalar:" / "NEON:" comment lines are the benchmark output recorded
# after each build.
# NOTE: every run reports "0.00 Mcycles". main.cpp's get_cycles() returns 0
# whenever reading the perf counter fails, so the cycle counter was most
# likely unavailable (e.g. perf_event_open not permitted for this user).

# Standard command
g++ -o benchmark main.cpp fun_neon.cpp fun.cpp
# Scalar: 131.32 msec, 0.00 Mcycles
# NEON: 72.39 msec, 0.00 Mcycles

# Extended with -mcpu and -mtune as shown here:
# https://gist.github.com/fm4dd/c663217935dc17f0fc73c9c81b0aa845
g++ -o benchmark main.cpp fun_neon.cpp fun.cpp -mtune=cortex-a72 -mcpu=cortex-a72
# Scalar: 130.91 msec, 0.00 Mcycles
# NEON: 71.79 msec, 0.00 Mcycles

# More extended with specific arch and operations as shown here:
# https://github.com/superuser789/MediaPipe-on-RaspberryPi
g++ -o benchmark main.cpp fun_neon.cpp fun.cpp -march=armv8-a+crc+simd -mcpu=cortex-a72 -mtune=cortex-a72
# Scalar: 132.33 msec, 0.00 Mcycles
# NEON: 71.44 msec, 0.00 Mcycles

# (IMPORTANT) with Optimization Flag:
# Only adding -O3 changes the timings noticeably; with it the scalar and the
# hand-written NEON version take about the same time.
g++ -o benchmark main.cpp fun_neon.cpp fun.cpp -march=armv8-a+crc+simd -mcpu=cortex-a72 -mtune=cortex-a72 -O3
# Scalar: 21.67 msec, 0.00 Mcycles
# NEON: 21.61 msec, 0.00 Mcycles
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Scalar reference kernel.
// For every index in [0, COUNT) computes x = A[i] * B[i] and stores
// (x + x*x) * x into C[i]. Nothing is written when COUNT <= 0.
void fun(int COUNT, float* A, float* B, float* C)
{
    int idx = 0;
    while (idx < COUNT)
    {
        const float prod = A[idx] * B[idx];
        C[idx] = (prod + prod * prod) * prod;
        ++idx;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <arm_neon.h> | |
// NEON version of the kernel: for every i in [0, COUNT) stores
// (x + x*x) * x into C[i], where x = A[i] * B[i].
//
// Four floats are processed per iteration; any remaining COUNT % 4 elements
// are handled by a scalar tail loop so the whole output range is written.
// Loads and stores go through vld1q_f32 / vst1q_f32, so the buffers only
// need the natural alignment of float rather than 16-byte alignment.
// Nothing is written when COUNT <= 0.
void fun_neon(int COUNT, float* A, float* B, float* C)
{
    int i = 0;

    // "COUNT - i >= 4" cannot overflow, unlike "i + 4 <= COUNT".
    for (; COUNT - i >= 4; i += 4)
    {
        float32x4_t x = vmulq_f32(vld1q_f32(A + i), vld1q_f32(B + i));
        // vmlaq_f32(x, x, x) yields x + x*x per lane.
        vst1q_f32(C + i, vmulq_f32(x, vmlaq_f32(x, x, x)));
    }

    // Scalar tail for the last COUNT % 4 elements.
    for (; i < COUNT; i++)
    {
        float x = A[i] * B[i];
        C[i] = (x + x * x) * x;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdint.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <time.h> | |
#include <unistd.h> // read, close | |
#include <sys/syscall.h> // syscall, __NR_perf_even_open | |
#ifndef __ANDROID__ | |
#include <linux/perf_event.h> // perf_event_attr, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES | |
#else | |
// Android NDK doesn't have perf_event.h header :( | |
// copy&paste from http://lxr.free-electrons.com/source/include/uapi/linux/perf_event.h | |
// Values for perf_event_attr.type (local copy of the kernel UAPI enum,
// used only when the system header is not available).
enum perf_type_id
{
    PERF_TYPE_HARDWARE   = 0,
    PERF_TYPE_SOFTWARE   = 1,
    PERF_TYPE_TRACEPOINT = 2,
    PERF_TYPE_HW_CACHE   = 3,
    PERF_TYPE_RAW        = 4,
    PERF_TYPE_BREAKPOINT = 5,

    PERF_TYPE_MAX, // implicit value: one past the last entry above (6)
};
// Values for perf_event_attr.config when the event type is
// PERF_TYPE_HARDWARE (local copy of the kernel UAPI enum).
enum perf_hw_id {
    PERF_COUNT_HW_CPU_CYCLES              = 0,
    PERF_COUNT_HW_INSTRUCTIONS            = 1,
    PERF_COUNT_HW_CACHE_REFERENCES        = 2,
    PERF_COUNT_HW_CACHE_MISSES            = 3,
    PERF_COUNT_HW_BRANCH_INSTRUCTIONS     = 4,
    PERF_COUNT_HW_BRANCH_MISSES           = 5,
    PERF_COUNT_HW_BUS_CYCLES              = 6,
    PERF_COUNT_HW_STALLED_CYCLES_FRONTEND = 7,
    PERF_COUNT_HW_STALLED_CYCLES_BACKEND  = 8,
    PERF_COUNT_HW_REF_CPU_CYCLES          = 9,

    PERF_COUNT_HW_MAX, // implicit value: one past the last entry above (10)
};
// Kernel-style fixed-width aliases used by the struct below.
// They are spelled with the same underlying types the Linux UAPI headers use
// (int / unsigned int / unsigned long long) instead of the <stdint.h> names:
// on LP64 targets uint64_t is `unsigned long`, which would be a conflicting
// redeclaration if <linux/types.h> has already been pulled in by another
// system header. Repeating an identical typedef is harmless.
typedef int __s32;
typedef unsigned int __u32;
typedef unsigned long long __u64;

// Local copy of the kernel's event description passed to perf_event_open.
// The field order and widths must stay exactly as they are: the kernel
// reads this structure by layout. With natural alignment it is 112 bytes
// (PERF_ATTR_SIZE_VER5 in the kernel header).
struct perf_event_attr
{
    __u32 type;   // one of enum perf_type_id
    __u32 size;   // size of this structure in bytes
    __u64 config; // type-specific event selector (e.g. enum perf_hw_id)

    union {
        __u64 sample_period;
        __u64 sample_freq;
    };

    __u64 sample_type;
    __u64 read_format;

    // Flag bits; the widths below add up to exactly 64.
    __u64 disabled : 1,
          inherit : 1,
          pinned : 1,
          exclusive : 1,
          exclude_user : 1,
          exclude_kernel : 1,
          exclude_hv : 1,
          exclude_idle : 1,
          mmap : 1,
          comm : 1,
          freq : 1,
          inherit_stat : 1,
          enable_on_exec : 1,
          task : 1,
          watermark : 1,
          precise_ip : 2,
          mmap_data : 1,
          sample_id_all : 1,
          exclude_host : 1,
          exclude_guest : 1,
          exclude_callchain_kernel : 1,
          exclude_callchain_user : 1,
          mmap2 : 1,
          comm_exec : 1,
          use_clockid : 1,
          __reserved_1 : 38;

    union {
        __u32 wakeup_events;
        __u32 wakeup_watermark;
    };

    __u32 bp_type;

    union {
        __u64 bp_addr;
        __u64 config1;
    };

    union {
        __u64 bp_len;
        __u64 config2;
    };

    __u64 branch_sample_type;
    __u64 sample_regs_user;
    __u32 sample_stack_user;
    __s32 clockid;
    __u64 sample_regs_intr;
    __u32 aux_watermark;
    __u32 __reserved_2;
};
#endif | |
// Returns the current CLOCK_MONOTONIC reading converted to nanoseconds.
static uint64_t get_ticks()
{
    struct timespec now;
    clock_gettime(CLOCK_MONOTONIC, &now);

    const uint64_t seconds_as_ns = (uint64_t)now.tv_sec * 1000000000ULL;
    return seconds_as_ns + (uint64_t)now.tv_nsec;
}
// File descriptor the cycle counter is read from; assigned in main().
static int fd;

// Reads one 64-bit counter value from `fd`.
// Returns the value read, or 0 if the read fails or delivers fewer than
// sizeof(uint64_t) bytes.
static uint64_t get_cycles()
{
    uint64_t value = 0;
    const ssize_t got = read(fd, &value, sizeof(value));

    if (got >= (ssize_t)sizeof(value))
    {
        return value;
    }
    return 0;
}
void fun(int COUNT, float* A, float* B, float* C); | |
void fun_neon(int COUNT, float* A, float* B, float* C); | |
int main() | |
{ | |
struct perf_event_attr attr = {}; | |
attr.type = PERF_TYPE_HARDWARE; | |
attr.config = PERF_COUNT_HW_CPU_CYCLES; | |
fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0); | |
const int COUNT = 8 * 1024*1024; // 8 MB | |
static float A[COUNT] __attribute__((aligned(16))); | |
static float B[COUNT] __attribute__((aligned(16))); | |
static float C[COUNT] __attribute__((aligned(16))); | |
for (int i=0; i<COUNT; i++) | |
{ | |
A[i] = (float)rand()/(float)RAND_MAX; | |
B[i] = (float)rand()/(float)RAND_MAX; | |
} | |
uint64_t t1, t2, c1, c2; | |
{ | |
fun(COUNT, A, B, C); | |
t1 = get_ticks(); | |
c1 = get_cycles(); | |
fun(COUNT, A, B, C); | |
c2 = get_cycles(); | |
t2 = get_ticks(); | |
printf("Scalar: %.2f msec, %.2f Mcycles\n", (float)(t2 - t1) / 1e6f, (float)(c2 - c1) / 1e6f); | |
} | |
{ | |
fun_neon(COUNT, A, B, C); | |
t1 = get_ticks(); | |
c1 = get_cycles(); | |
fun_neon(COUNT, A, B, C); | |
c2 = get_cycles(); | |
t2 = get_ticks(); | |
printf("NEON: %.2f msec, %.2f Mcycles\n", (float)(t2 - t1) / 1e6f, (float)(c2 - c1) / 1e6f); | |
} | |
close(fd); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment