Created
July 8, 2018 21:03
-
-
Save juj/e33338d2630bdfcef8c1e35b05c7be3b to your computer and use it in GitHub Desktop.
Raspberry Pi Zero linear memory block diff version 2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <errno.h>
#include <memory.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <syslog.h>
#include <unistd.h>

#include <bcm_host.h>
// Pointer through which tick() reads a 64-bit counter. It starts out null;
// main() points it into the memory-mapped peripheral window before the first
// tick() call and treats differences between two readings as microseconds.
volatile uint64_t *systemTimerRegister = 0;

// Current raw value of the counter behind systemTimerRegister.
// Must not be used while systemTimerRegister is still null.
#define tick() (*systemTimerRegister)

// The two buffers compared by diff(): 320*480*2 = 307200 bytes each,
// aligned to 32 bytes.
uint8_t __attribute__((aligned(32))) a[320*480*2];
uint8_t __attribute__((aligned(32))) b[320*480*2];
// Coarse comparison of two 307200-byte (320*480*2) buffers in 32-byte blocks.
//
// a, b: buffers to compare; each must be at least 307200 bytes long. The
//       callers in this file pass 32-byte-aligned arrays (NOTE(review): the
//       ldmia loads presumably need at least word alignment - confirm before
//       passing arbitrary pointers).
//
// Returns the byte offset just past the first 32-byte block in which the
// buffers differ, or 307200 if the loop ran to the end. A difference inside
// the final block also yields 307200, so the caller has to re-scan the 32
// bytes before the returned offset to learn whether and where the buffers
// really differ.
int diff(uint8_t *a, uint8_t *b)
{
#if defined(__arm__) && !defined(__thumb__)
  uint8_t *endPtr;
  // __asm__/__volatile__ spellings are used so this also builds in strict ISO modes (-std=c11).
  __asm__ __volatile__(
    "mov r0, %[a]\n" // pointer to buffer 1 to diff
    "add r10, r0, #307200\n" // end pointer of buffer 1, loop finish condition
    "mov r1, %[b]\n" // pointer to buffer 2 to diff
    "start_%=:\n"
    "ldmia r0!, {r2,r3,r4,r5}\n" // load 4x32-bit elements of buffer 1
    "ldmia r1!, {r6,r7,r8,r9}\n" // corresponding elements from buffer 2
    "pld [r0, #240]\n" // r0 already advanced by 16, so this preloads 256 bytes past the start of the current block
    "cmp r2, r6\n" // compare all elements for diff; each cmpeq only runs while everything so far was equal
    "cmpeq r3, r7\n"
    "cmpeq r4, r8\n"
    "cmpeq r5, r9\n"
    // unroll once to process 8x32bits = 32 bytes = L1 data cache line size of Raspberry Pi Zero in one iteration
    "ldmia r1!, {r6,r7,r8,r9}\n" // ldmia leaves the flags alone, so the compare chain carries on across the loads
    "ldmia r0!, {r2,r3,r4,r5}\n"
    "pld [r1, #224]\n" // r1 already advanced by 32, so this is also 256 bytes past the start of the current block
    "cmpeq r2, r6\n"
    "cmpeq r3, r7\n"
    "cmpeq r4, r8\n"
    "cmpeq r5, r9\n"
    "bne end_%=\n" // some word in this 32-byte block differed
    "cmp r0, r10\n" // test loop end condition
    "blo start_%=\n"
    "end_%=:\n"
    "mov %[result], r0\n\t" // r0 points just past the last block that was examined
    : [result]"=r"(endPtr)
    : [a]"r"(a), [b]"r"(b)
    // "cc": the cmp instructions rewrite the condition flags.
    // "memory": both buffers are read through bare pointers, so pending stores to them must be completed first.
    : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc", "memory"
  );
  return (int)(endPtr - a);
#else
  // Portable fallback with the same contract, for targets where the ARM-mode assembly above cannot be assembled.
  enum { DIFF_BYTES = 320*480*2, DIFF_BLOCK = 32 };
  for (int off = 0; off < DIFF_BYTES; off += DIFF_BLOCK)
  {
    for (int i = 0; i < DIFF_BLOCK; ++i)
    {
      if (a[off + i] != b[off + i])
      {
        return off + DIFF_BLOCK;
      }
    }
  }
  return DIFF_BYTES;
#endif
}
#define BCM2835_TIMER_BASE 0x3000 | |
int main() | |
{ | |
int mem_fd = open("/dev/mem", O_RDWR|O_SYNC); | |
void *bcm2835 = mmap(NULL, bcm_host_get_peripheral_size(), (PROT_READ | PROT_WRITE), MAP_SHARED, mem_fd, bcm_host_get_peripheral_address()); | |
systemTimerRegister = (volatile uint64_t*)((uintptr_t)bcm2835 + BCM2835_TIMER_BASE + 0x04); // Generates an unaligned 64-bit pointer, but seems to be fine. | |
for(int x = 0; x < 32; ++x) // 32 test cases | |
{ | |
for(int i = 0; i < 320*480*2; ++i) if (tick() == 0) a[i] = 0; else a[i] = 0xEE; | |
for(int i = 0; i < 320*480*2; ++i) if (tick() == 0) b[i] = 0; else b[i] = 0xEE; | |
a[320*480*2-1-x] = 0x10; // Introduce an actual diff towards the very end | |
uint64_t t0 = tick(); | |
int d = diff(a, b); | |
int e = d; | |
// diff is coarse, find the exact position outside hot loop | |
if (e <= 307200) | |
{ | |
e -= 32; | |
while(a[e] == b[e] && e < 307200) ++e; | |
} | |
uint64_t t1 = tick(); | |
double clocks = (double)(t1-t0)*1000.0; | |
if (clocks == 0) clocks = 1; | |
int bytes = sizeof(a); | |
double clocksPerByte = clocks / bytes; | |
printf("bytes diffed: %d, diff at pos: %d (exact: %d), usecs taken: %llu, %f clocks/byte, %f clocks/32b (%f MB/sec)\n", bytes, d, e, t1-t0, clocksPerByte, clocksPerByte*32.0, 1e9/clocksPerByte / 1000000.0); | |
} | |
} | |
// g++ -I/opt/vc/include -L/opt/vc/lib -lbcm_host -O3 -o test test.cpp
// sudo ./test
Author
juj
commented
Jul 8, 2018
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment