jungin500/build.sh

## build.sh
# OS: Raspbian Buster(10) armv7l on Raspberry Pi 3B (1GB)
#
# Every command below writes the same output binary (`benchmark`) from the same
# three sources, only the compiler flags differ. The two comment lines after
# each command are the benchmark output that was noted for that build.
#
# NOTE(review): every recorded run shows "0.00 Mcycles". main.cpp prints a zero
# delta when the cycle counter cannot be read, so presumably perf_event_open
# failed on this machine - confirm before trusting the cycle column.

# Standard command (-mfpu=neon-fp-armv8 or -mfpu=neon-vfpv4 is necessary for build!)
g++ -o benchmark main.cpp fun_neon.cpp fun.cpp -mfpu=neon-vfpv4
# Scalar: 360.64 msec, 0.00 Mcycles
# NEON: 171.59 msec, 0.00 Mcycles

# Standard command with "-mfloat-abi=hard"
g++ -o benchmark main.cpp fun_neon.cpp fun.cpp -mfpu=neon-vfpv4 -mfloat-abi=hard
# Scalar: 362.39 msec, 0.00 Mcycles
# NEON: 172.54 msec, 0.00 Mcycles

# Extended with -mcpu and -mtune as shown here:
#   https://gist.github.com/fm4dd/c663217935dc17f0fc73c9c81b0aa845
g++ -o benchmark main.cpp fun_neon.cpp fun.cpp \
    -mtune=cortex-a53 -mcpu=cortex-a53 -mfloat-abi=hard -mfpu=neon-fp-armv8 -mneon-for-64bits
# Scalar: 363.59 msec, 0.00 Mcycles
# NEON: 174.56 msec, 0.00 Mcycles

# Further extended with a specific -march and -ftree-vectorize as shown here:
#   https://github.com/superuser789/MediaPipe-on-RaspberryPi
g++ -o benchmark main.cpp fun_neon.cpp fun.cpp \
    -march=armv8-a+crc -mfpu=neon-vfpv4 -mtune=cortex-a53 -ftree-vectorize -mfloat-abi=hard
# Scalar: 360.06 msec, 0.00 Mcycles
# NEON: 172.10 msec, 0.00 Mcycles

# (IMPORTANT) Same flags as the previous command plus -O3; this is the only
# build in this list that passes an optimization level.
g++ -o benchmark main.cpp fun_neon.cpp fun.cpp \
    -march=armv8-a+crc -mfpu=neon-vfpv4 -mtune=cortex-a53 -ftree-vectorize -mfloat-abi=hard -O3
# Scalar: 142.38 msec, 0.00 Mcycles
# NEON: 51.86 msec, 0.00 Mcycles

## fun.cpp
// Scalar reference kernel.
//
// For each of the first COUNT elements, computes x = A[i] * B[i] and stores
// (x + x * x) * x into C[i]. Nothing is written when COUNT <= 0.
void fun(int COUNT, float* A, float* B, float* C)
{
    const float* lhs = A;
    const float* rhs = B;
    float* out = C;

    for (int remaining = COUNT; remaining > 0; --remaining)
    {
        const float x = *lhs++ * *rhs++;
        *out++ = (x + x * x) * x;
    }
}

## fun_neon.cpp
#include <arm_neon.h>

// NEON version of fun(): for each of the first COUNT elements, computes
// x = A[i] * B[i] and stores (x + x * x) * x into C[i].
//
// Elements are processed four at a time with 128-bit NEON operations; any
// remaining COUNT % 4 elements are handled by a scalar loop so that every
// output element is written. Nothing is written when COUNT <= 0.
void fun_neon(int COUNT, float* A, float* B, float* C)
{
    int i = 0;

    // Vector body. Explicit load/store intrinsics are used instead of
    // dereferencing float32x4_t pointers, so the arrays only need to be
    // valid float arrays (no reliance on a pointer cast between types).
    // "COUNT - i >= 4" cannot overflow because 0 <= i <= COUNT here.
    for (; COUNT - i >= 4; i += 4)
    {
        const float32x4_t x = vmulq_f32(vld1q_f32(A + i), vld1q_f32(B + i));
        // vmlaq_f32(x, x, x) yields x + x * x per lane.
        vst1q_f32(C + i, vmulq_f32(x, vmlaq_f32(x, x, x)));
    }

    // Scalar tail for the last COUNT % 4 elements (same formula as fun()).
    for (; i < COUNT; i++)
    {
        const float x = A[i] * B[i];
        C[i] = (x + x * x) * x;
    }
}

## main.cpp
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>           // read, close
#include <sys/syscall.h>      // syscall, __NR_perf_event_open

#ifndef __ANDROID__
#include <linux/perf_event.h> // perf_event_attr, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES
#else
// Android NDK doesn't have perf_event.h header :(
// copy&paste from http://lxr.free-electrons.com/source/include/uapi/linux/perf_event.h

// Event type identifiers, copied from the kernel's uapi perf_event.h for
// builds that cannot include that header. Assigned to perf_event_attr::type.
enum perf_type_id
{
    PERF_TYPE_HARDWARE   = 0,
    PERF_TYPE_SOFTWARE   = 1,
    PERF_TYPE_TRACEPOINT = 2,
    PERF_TYPE_HW_CACHE   = 3,
    PERF_TYPE_RAW        = 4,
    PERF_TYPE_BREAKPOINT = 5,

    // One past the last identifier listed above.
    PERF_TYPE_MAX        = PERF_TYPE_BREAKPOINT + 1,
};

// Hardware event identifiers, copied from the kernel's uapi perf_event.h for
// builds that cannot include that header. Assigned to perf_event_attr::config.
enum perf_hw_id {
    PERF_COUNT_HW_CPU_CYCLES              = 0,
    PERF_COUNT_HW_INSTRUCTIONS            = 1,
    PERF_COUNT_HW_CACHE_REFERENCES        = 2,
    PERF_COUNT_HW_CACHE_MISSES            = 3,
    PERF_COUNT_HW_BRANCH_INSTRUCTIONS     = 4,
    PERF_COUNT_HW_BRANCH_MISSES           = 5,
    PERF_COUNT_HW_BUS_CYCLES              = 6,
    PERF_COUNT_HW_STALLED_CYCLES_FRONTEND = 7,
    PERF_COUNT_HW_STALLED_CYCLES_BACKEND  = 8,
    PERF_COUNT_HW_REF_CPU_CYCLES          = 9,

    // One past the last identifier listed above.
    PERF_COUNT_HW_MAX                     = PERF_COUNT_HW_REF_CPU_CYCLES + 1,
};

// Fixed-width aliases under the names used by the copied struct below.
typedef int32_t __s32;
typedef uint32_t __u32;
typedef uint64_t __u64;

// Local copy of the perf_event_attr layout, used only on the __ANDROID__
// branch where <linux/perf_event.h> is not included. The address of this
// struct is passed to the perf_event_open syscall in main(), which only
// sets `type` and `config` and leaves everything else zero-initialized.
//
// NOTE(review): field order and widths presumably have to match the kernel
// header this was copied from - do not reorder or resize; confirm against
// the kernel version being targeted.
struct perf_event_attr
{
        __u32                   type;    // set from enum perf_type_id in main()
        __u32                   size;
        __u64                   config;  // set from enum perf_hw_id in main()
        union {
                __u64           sample_period;
                __u64           sample_freq;
        };
        __u64                   sample_type;
        __u64                   read_format;
        // Single 64-bit word of bit-field flags: 26 named bits
        // (precise_ip is 2 bits wide) plus 38 reserved bits = 64.
        __u64                   disabled       :  1,
                                inherit        :  1,
                                pinned         :  1,
                                exclusive      :  1,
                                exclude_user   :  1,
                                exclude_kernel :  1,
                                exclude_hv     :  1,
                                exclude_idle   :  1,
                                mmap           :  1,
                                comm           :  1,
                                freq           :  1,
                                inherit_stat   :  1,
                                enable_on_exec :  1,
                                task           :  1,
                                watermark      :  1,
                                precise_ip     :  2,
                                mmap_data      :  1,
                                sample_id_all  :  1,
                                exclude_host   :  1,
                                exclude_guest  :  1,
                                exclude_callchain_kernel : 1,
                                exclude_callchain_user   : 1,
                                mmap2          :  1,
                                comm_exec      :  1,
                                use_clockid    :  1,
                                __reserved_1   : 38;
        union {
                __u32           wakeup_events;
                __u32           wakeup_watermark;
        };
        __u32                   bp_type;
        union {
                __u64           bp_addr;
                __u64           config1;
        };
        union {
                __u64           bp_len;
                __u64           config2;
        };
        __u64   branch_sample_type;
        __u64   sample_regs_user;
        __u32   sample_stack_user;
        __s32   clockid;
        __u64   sample_regs_intr;
        __u32   aux_watermark;
        __u32   __reserved_2;
};
#endif

// Returns the current CLOCK_MONOTONIC reading converted to nanoseconds
// (seconds * 1e9 + nanoseconds). The clock_gettime result is not checked.
static uint64_t get_ticks()
{
  struct timespec now;
  clock_gettime(CLOCK_MONOTONIC, &now);

  const unsigned long long whole_seconds_ns = now.tv_sec * 1000000000ULL;
  return whole_seconds_ns + now.tv_nsec;
}

// Descriptor that get_cycles() reads from; main() stores the result of the
// perf_event_open syscall here and closes it at exit.
static int fd;

// Reads one 64-bit value from fd and returns it. Returns 0 when the read
// fails or delivers fewer than sizeof(uint64_t) bytes.
static uint64_t get_cycles()
{
    uint64_t counter;
    const ssize_t got = read(fd, &counter, sizeof(counter));
    return (got < (ssize_t)sizeof(counter)) ? 0 : counter;
}

// Kernels under test; both take an element count, two input arrays and one
// output array. main() calls them with the same arguments.
void fun(int COUNT, float* A, float* B, float* C);       // scalar version (fun.cpp)
void fun_neon(int COUNT, float* A, float* B, float* C);  // NEON version (fun_neon.cpp)

// Benchmark driver.
//
// Fills A and B with pseudo-random values in [0, 1], then for fun() and for
// fun_neon() runs one untimed warm-up call followed by one timed call over
// the same arrays, printing elapsed wall-clock milliseconds and CPU
// megacycles for each. If the cycle counter cannot be opened a warning is
// written to stderr and the Mcycles column reads 0.00.
int main()
{
    struct perf_event_attr attr = {};
    attr.type = PERF_TYPE_HARDWARE;
    attr.size = sizeof(attr);
    attr.config = PERF_COUNT_HW_CPU_CYCLES;

    // Arguments after &attr: pid = 0, cpu = -1, group_fd = -1, flags = 0
    // (see perf_event_open(2)).
    fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
    if (fd < 0)
    {
        // Keep going: timings are still useful, but make it obvious why the
        // cycle column is zero instead of failing silently.
        perror("perf_event_open");
        fprintf(stderr, "warning: cycle counter unavailable, Mcycles will read 0.00\n");
    }

    const int COUNT = 8 * 1024*1024; // 8 Mi floats per array

    // static: three arrays of this size do not belong on the stack.
    static float A[COUNT] __attribute__((aligned(16)));
    static float B[COUNT] __attribute__((aligned(16)));
    static float C[COUNT] __attribute__((aligned(16)));

    for (int i=0; i<COUNT; i++)
    {
        A[i] = (float)rand()/(float)RAND_MAX;
        B[i] = (float)rand()/(float)RAND_MAX;
    }

    uint64_t t1, t2, c1, c2;

    {
        // Warm-up call, not timed.
        fun(COUNT, A, B, C);

        t1 = get_ticks();
        c1 = get_cycles();

        fun(COUNT, A, B, C);

        c2 = get_cycles();
        t2 = get_ticks();

        printf("Scalar: %.2f msec, %.2f Mcycles\n", (float)(t2 - t1) / 1e6f, (float)(c2 - c1) / 1e6f);
    }

    {
        // Warm-up call, not timed.
        fun_neon(COUNT, A, B, C);

        t1 = get_ticks();
        c1 = get_cycles();

        fun_neon(COUNT, A, B, C);

        c2 = get_cycles();
        t2 = get_ticks();

        printf("NEON: %.2f msec, %.2f Mcycles\n", (float)(t2 - t1) / 1e6f, (float)(c2 - c1) / 1e6f);
    }

    // Only close a descriptor that was actually opened.
    if (fd >= 0)
    {
        close(fd);
    }
    return 0;
}
	# OS: Raspbian Buster(10) armv7l on Raspberry Pi 3B (1GB)
	#
	# Every command below writes the same output binary (`benchmark`) from the same
	# three sources, only the compiler flags differ. The two comment lines after
	# each command are the benchmark output that was noted for that build.

	# Standard command (-mfpu=neon-fp-armv8 or -mfpu=neon-vfpv4 is necessary for build!)
	g++ -o benchmark main.cpp fun_neon.cpp fun.cpp -mfpu=neon-vfpv4
	# Scalar: 360.64 msec, 0.00 Mcycles
	# NEON: 171.59 msec, 0.00 Mcycles

	# Standard command with "-mfloat-abi=hard"
	g++ -o benchmark main.cpp fun_neon.cpp fun.cpp -mfpu=neon-vfpv4 -mfloat-abi=hard
	# Scalar: 362.39 msec, 0.00 Mcycles
	# NEON: 172.54 msec, 0.00 Mcycles

	# Extended with -mcpu and -mtune as shown here:
	# https://gist.github.com/fm4dd/c663217935dc17f0fc73c9c81b0aa845
	g++ -o benchmark main.cpp fun_neon.cpp fun.cpp \
	-mtune=cortex-a53 -mcpu=cortex-a53 -mfloat-abi=hard -mfpu=neon-fp-armv8 -mneon-for-64bits
	# Scalar: 363.59 msec, 0.00 Mcycles
	# NEON: 174.56 msec, 0.00 Mcycles

	# Further extended with a specific -march and -ftree-vectorize as shown here:
	# https://github.com/superuser789/MediaPipe-on-RaspberryPi
	g++ -o benchmark main.cpp fun_neon.cpp fun.cpp \
	-march=armv8-a+crc -mfpu=neon-vfpv4 -mtune=cortex-a53 -ftree-vectorize -mfloat-abi=hard
	# Scalar: 360.06 msec, 0.00 Mcycles
	# NEON: 172.10 msec, 0.00 Mcycles

	# (IMPORTANT) Same flags as the previous command plus -O3; this is the only
	# build in this list that passes an optimization level.
	g++ -o benchmark main.cpp fun_neon.cpp fun.cpp \
	-march=armv8-a+crc -mfpu=neon-vfpv4 -mtune=cortex-a53 -ftree-vectorize -mfloat-abi=hard -O3
	# Scalar: 142.38 msec, 0.00 Mcycles
	# NEON: 51.86 msec, 0.00 Mcycles
	// Scalar reference kernel.
	//
	// For each of the first COUNT elements, computes x = A[i] * B[i] and stores
	// (x + x * x) * x into C[i]. Nothing is written when COUNT <= 0.
	void fun(int COUNT, float* A, float* B, float* C)
	{
	    const float* lhs = A;
	    const float* rhs = B;
	    float* out = C;

	    for (int remaining = COUNT; remaining > 0; --remaining)
	    {
	        const float x = *lhs++ * *rhs++;
	        *out++ = (x + x * x) * x;
	    }
	}
	#include <arm_neon.h>

	// NEON version of fun(): for each of the first COUNT elements, computes
	// x = A[i] * B[i] and stores (x + x * x) * x into C[i].
	//
	// Elements are processed four at a time with 128-bit NEON operations; any
	// remaining COUNT % 4 elements are handled by a scalar loop so that every
	// output element is written. Nothing is written when COUNT <= 0.
	void fun_neon(int COUNT, float* A, float* B, float* C)
	{
	    int i = 0;

	    // Vector body. Explicit load/store intrinsics are used instead of
	    // dereferencing float32x4_t pointers, so the arrays only need to be
	    // valid float arrays (no reliance on a pointer cast between types).
	    // "COUNT - i >= 4" cannot overflow because 0 <= i <= COUNT here.
	    for (; COUNT - i >= 4; i += 4)
	    {
	        const float32x4_t x = vmulq_f32(vld1q_f32(A + i), vld1q_f32(B + i));
	        // vmlaq_f32(x, x, x) yields x + x * x per lane.
	        vst1q_f32(C + i, vmulq_f32(x, vmlaq_f32(x, x, x)));
	    }

	    // Scalar tail for the last COUNT % 4 elements (same formula as fun()).
	    for (; i < COUNT; i++)
	    {
	        const float x = A[i] * B[i];
	        C[i] = (x + x * x) * x;
	    }
	}
	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <time.h>
	#include <unistd.h> // read, close
	#include <sys/syscall.h> // syscall, __NR_perf_event_open

	#ifndef __ANDROID__
	#include <linux/perf_event.h> // perf_event_attr, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES
	#else
	// Android NDK doesn't have perf_event.h header :(
	// copy&paste from http://lxr.free-electrons.com/source/include/uapi/linux/perf_event.h

	// Event type identifiers, copied from the kernel's uapi perf_event.h for
	// builds that cannot include that header. Assigned to perf_event_attr::type.
	enum perf_type_id
	{
	    PERF_TYPE_HARDWARE   = 0,
	    PERF_TYPE_SOFTWARE   = 1,
	    PERF_TYPE_TRACEPOINT = 2,
	    PERF_TYPE_HW_CACHE   = 3,
	    PERF_TYPE_RAW        = 4,
	    PERF_TYPE_BREAKPOINT = 5,

	    // One past the last identifier listed above.
	    PERF_TYPE_MAX        = PERF_TYPE_BREAKPOINT + 1,
	};

	// Hardware event identifiers, copied from the kernel's uapi perf_event.h for
	// builds that cannot include that header. Assigned to perf_event_attr::config.
	enum perf_hw_id {
	    PERF_COUNT_HW_CPU_CYCLES              = 0,
	    PERF_COUNT_HW_INSTRUCTIONS            = 1,
	    PERF_COUNT_HW_CACHE_REFERENCES        = 2,
	    PERF_COUNT_HW_CACHE_MISSES            = 3,
	    PERF_COUNT_HW_BRANCH_INSTRUCTIONS     = 4,
	    PERF_COUNT_HW_BRANCH_MISSES           = 5,
	    PERF_COUNT_HW_BUS_CYCLES              = 6,
	    PERF_COUNT_HW_STALLED_CYCLES_FRONTEND = 7,
	    PERF_COUNT_HW_STALLED_CYCLES_BACKEND  = 8,
	    PERF_COUNT_HW_REF_CPU_CYCLES          = 9,

	    // One past the last identifier listed above.
	    PERF_COUNT_HW_MAX                     = PERF_COUNT_HW_REF_CPU_CYCLES + 1,
	};

	// Fixed-width aliases under the names used by the copied struct below.
	typedef int32_t __s32;
	typedef uint32_t __u32;
	typedef uint64_t __u64;

	// Local copy of the perf_event_attr layout, used only on the __ANDROID__
	// branch where <linux/perf_event.h> is not included. The address of this
	// struct is passed to the perf_event_open syscall in main(), which only
	// sets `type` and `config` and leaves everything else zero-initialized.
	//
	// NOTE(review): field order and widths presumably have to match the kernel
	// header this was copied from - do not reorder or resize; confirm against
	// the kernel version being targeted.
	struct perf_event_attr
	{
	__u32 type; // set from enum perf_type_id in main()
	__u32 size;
	__u64 config; // set from enum perf_hw_id in main()
	union {
	__u64 sample_period;
	__u64 sample_freq;
	};
	__u64 sample_type;
	__u64 read_format;
	// Single 64-bit word of bit-field flags: 26 named bits
	// (precise_ip is 2 bits wide) plus 38 reserved bits = 64.
	__u64 disabled : 1,
	inherit : 1,
	pinned : 1,
	exclusive : 1,
	exclude_user : 1,
	exclude_kernel : 1,
	exclude_hv : 1,
	exclude_idle : 1,
	mmap : 1,
	comm : 1,
	freq : 1,
	inherit_stat : 1,
	enable_on_exec : 1,
	task : 1,
	watermark : 1,
	precise_ip : 2,
	mmap_data : 1,
	sample_id_all : 1,
	exclude_host : 1,
	exclude_guest : 1,
	exclude_callchain_kernel : 1,
	exclude_callchain_user : 1,
	mmap2 : 1,
	comm_exec : 1,
	use_clockid : 1,
	__reserved_1 : 38;
	union {
	__u32 wakeup_events;
	__u32 wakeup_watermark;
	};
	__u32 bp_type;
	union {
	__u64 bp_addr;
	__u64 config1;
	};
	union {
	__u64 bp_len;
	__u64 config2;
	};
	__u64 branch_sample_type;
	__u64 sample_regs_user;
	__u32 sample_stack_user;
	__s32 clockid;
	__u64 sample_regs_intr;
	__u32 aux_watermark;
	__u32 __reserved_2;
	};
	#endif

	// Returns the current CLOCK_MONOTONIC reading converted to nanoseconds
	// (seconds * 1e9 + nanoseconds). The clock_gettime result is not checked.
	static uint64_t get_ticks()
	{
	    struct timespec now;
	    clock_gettime(CLOCK_MONOTONIC, &now);

	    const unsigned long long whole_seconds_ns = now.tv_sec * 1000000000ULL;
	    return whole_seconds_ns + now.tv_nsec;
	}

	// Descriptor that get_cycles() reads from; main() stores the result of the
	// perf_event_open syscall here and closes it at exit.
	static int fd;

	// Reads one 64-bit value from fd and returns it. Returns 0 when the read
	// fails or delivers fewer than sizeof(uint64_t) bytes.
	static uint64_t get_cycles()
	{
	    uint64_t counter;
	    const ssize_t got = read(fd, &counter, sizeof(counter));
	    return (got < (ssize_t)sizeof(counter)) ? 0 : counter;
	}

	// Kernels under test; both take an element count, two input arrays and one
	// output array. main() calls them with the same arguments.
	void fun(int COUNT, float* A, float* B, float* C); // scalar version
	void fun_neon(int COUNT, float* A, float* B, float* C); // NEON version

	// Benchmark driver.
	//
	// Fills A and B with pseudo-random values in [0, 1], then for fun() and for
	// fun_neon() runs one untimed warm-up call followed by one timed call over
	// the same arrays, printing elapsed wall-clock milliseconds and CPU
	// megacycles for each. If the cycle counter cannot be opened a warning is
	// written to stderr and the Mcycles column reads 0.00.
	int main()
	{
	    struct perf_event_attr attr = {};
	    attr.type = PERF_TYPE_HARDWARE;
	    attr.size = sizeof(attr);
	    attr.config = PERF_COUNT_HW_CPU_CYCLES;

	    // Arguments after &attr: pid = 0, cpu = -1, group_fd = -1, flags = 0
	    // (see perf_event_open(2)).
	    fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	    if (fd < 0)
	    {
	        // Keep going: timings are still useful, but make it obvious why the
	        // cycle column is zero instead of failing silently.
	        perror("perf_event_open");
	        fprintf(stderr, "warning: cycle counter unavailable, Mcycles will read 0.00\n");
	    }

	    const int COUNT = 8 * 1024*1024; // 8 Mi floats per array

	    // static: three arrays of this size do not belong on the stack.
	    static float A[COUNT] __attribute__((aligned(16)));
	    static float B[COUNT] __attribute__((aligned(16)));
	    static float C[COUNT] __attribute__((aligned(16)));

	    for (int i=0; i<COUNT; i++)
	    {
	        A[i] = (float)rand()/(float)RAND_MAX;
	        B[i] = (float)rand()/(float)RAND_MAX;
	    }

	    uint64_t t1, t2, c1, c2;

	    {
	        // Warm-up call, not timed.
	        fun(COUNT, A, B, C);

	        t1 = get_ticks();
	        c1 = get_cycles();

	        fun(COUNT, A, B, C);

	        c2 = get_cycles();
	        t2 = get_ticks();

	        printf("Scalar: %.2f msec, %.2f Mcycles\n", (float)(t2 - t1) / 1e6f, (float)(c2 - c1) / 1e6f);
	    }

	    {
	        // Warm-up call, not timed.
	        fun_neon(COUNT, A, B, C);

	        t1 = get_ticks();
	        c1 = get_cycles();

	        fun_neon(COUNT, A, B, C);

	        c2 = get_cycles();
	        t2 = get_ticks();

	        printf("NEON: %.2f msec, %.2f Mcycles\n", (float)(t2 - t1) / 1e6f, (float)(c2 - c1) / 1e6f);
	    }

	    // Only close a descriptor that was actually opened.
	    if (fd >= 0)
	    {
	        close(fd);
	    }
	    return 0;
	}