Created
July 5, 2018 18:31
-
-
Save juj/e6423dcd3c37470d7e96014d39ba983c to your computer and use it in GitHub Desktop.
Raspberry Pi Zero linear memory block diff
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdlib.h> | |
#include <memory.h> | |
#include <syslog.h> | |
#include <sys/types.h> | |
#include <sys/stat.h> | |
#include <fcntl.h> | |
#include <sys/mman.h> | |
#include <errno.h> | |
#include <stdint.h> | |
#include <bcm_host.h> | |
// Pointer to the memory-mapped 64-bit timer counter. It starts out null and
// is pointed at the mapped register by main() before the first tick() call.
volatile uint64_t *systemTimerRegister = NULL;

// Reads the current timer value through the volatile pointer above, so every
// use performs a fresh read. Must not be used while the pointer is still null.
#define tick() (*systemTimerRegister)

// The two equally sized buffers that diff() compares (320*480*2 bytes each).
// diff() fetches them with multi-word loads, which need word-aligned addresses,
// while a plain uint8_t array only guarantees byte alignment. Aligning to 32
// bytes also makes each 32-byte loop iteration of diff() cover one cache line.
uint8_t a[320*480*2] __attribute__((aligned(32)));
uint8_t b[320*480*2] __attribute__((aligned(32)));
/*
 * Coarse equality scan over two buffers of 320*480*2 (= 307200) bytes.
 *
 * Both buffers are walked front to back in 16-byte groups (four 32-bit words
 * each), two groups per loop iteration. The scan stops at the first group
 * that contains a differing word, or after the whole length has been read.
 *
 * Parameters:
 *   a, b - start of the two buffers. Each must provide at least 307200
 *          readable bytes. The ARM path uses LDMIA, so both pointers should
 *          be word aligned.
 *
 * Returns:
 *   Byte offset, relative to a, just PAST the last 16-byte group examined:
 *     - a mismatch inside group [k, k+16) yields k+16;
 *     - identical buffers yield 307200.
 *   The result is therefore only a coarse position. Note that a mismatch
 *   confined to the final 16 bytes also yields 307200, so a caller that needs
 *   the exact position must re-check the bytes leading up to the returned
 *   offset itself.
 */
int diff(uint8_t *a, uint8_t *b)
{
  // DIFF_BYTES must stay in sync with the #307200 literal in the assembly.
  enum { DIFF_BYTES = 320 * 480 * 2, GROUP_BYTES = 16 };
  uint8_t *endPtr;
#if defined(__arm__)
  // Numeric local labels (1:/2:) are used instead of named ones so that the
  // assembler does not see duplicate symbols if the compiler inlines or clones
  // this function. "cc" is clobbered because cmp rewrites the flags; "memory"
  // tells the compiler the asm reads the buffers, so pending stores to them
  // must be completed first.
  __asm__ volatile(
    "mov r1, %[a]\n" // pointer to buffer 1 to diff
    "add r0, r1, #307200\n" // end pointer of buffer 1, loop finish condition
    "mov r2, %[b]\n" // pointer to buffer 2 to diff
    "1:\n"
    "pld [r1, #128]\n" // preload data caches for both buffers 128 bytes ahead of time
    "pld [r2, #128]\n"
    "ldmia r1!, {r3,r4,r5,r6}\n" // load 4x32-bit elements of buffer 1
    "ldmia r2!, {r7,r8,r9,r10}\n" // corresponding elements from buffer 2
    "cmp r3, r7\n" // compare all elements for diff
    "bne 2f\n"
    "cmp r4, r8\n"
    "bne 2f\n"
    "cmp r5, r9\n"
    "bne 2f\n"
    "cmp r6, r10\n"
    "bne 2f\n"
    // unroll once to process 8x32bits = 32 bytes = L1 data cache line size of Raspberry Pi Zero in one iteration
    "ldmia r1!, {r3,r4,r5,r6}\n"
    "ldmia r2!, {r7,r8,r9,r10}\n"
    "cmp r3, r7\n"
    "bne 2f\n"
    "cmp r4, r8\n"
    "bne 2f\n"
    "cmp r5, r9\n"
    "bne 2f\n"
    "cmp r6, r10\n"
    "bne 2f\n"
    "cmp r0, r1\n" // test loop end condition
    "bne 1b\n"
    "2:\n"
    "mov %[result], r1\n" // r1 was already advanced past the last group loaded
    : [result]"=r"(endPtr)
    : [a]"r"(a), [b]"r"(b)
    : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc", "memory"
  );
#else
  // Portable fallback with the same contract as the assembly above: advance
  // past each 16-byte group first, then stop if that group differed. Checking
  // for the end after every group (rather than every second one) gives the
  // same result because the total length is a multiple of 32.
  const uint8_t *other = b;
  uint8_t *const stop = a + DIFF_BYTES;
  endPtr = a;
  do
  {
    int differs = 0;
    for (int i = 0; i < GROUP_BYTES; ++i)
    {
      if (endPtr[i] != other[i])
      {
        differs = 1;
        break;
      }
    }
    endPtr += GROUP_BYTES;
    other += GROUP_BYTES;
    if (differs)
    {
      break;
    }
  } while (endPtr != stop);
#endif
  return (int)(endPtr - a);
}
#define BCM2835_TIMER_BASE 0x3000 | |
int main() | |
{ | |
int mem_fd = open("/dev/mem", O_RDWR|O_SYNC); | |
if (mem_fd < 0) return -1; | |
void *bcm2835 = mmap(NULL, bcm_host_get_peripheral_size(), (PROT_READ | PROT_WRITE), MAP_SHARED, mem_fd, bcm_host_get_peripheral_address()); | |
if (bcm2835 == MAP_FAILED) return -1; | |
systemTimerRegister = (volatile uint64_t*)((uintptr_t)bcm2835 + BCM2835_TIMER_BASE + 0x04); // Generates an unaligned 64-bit pointer, but seems to be fine. | |
for(int i = 0; i < 320*480*2; ++i) | |
if (tick() == 0) a[i] = 0; else a[i] = 0xEE; | |
for(int i = 0; i < 320*480*2; ++i) | |
if (tick() == 0) b[i] = 0; else b[i] = 0xEE; | |
uint64_t t0 = tick(); | |
int d = diff(a, b); | |
// diff is coarse, find the exact position outside hot loop | |
if (d != 307200) | |
{ | |
d -= 32; | |
while(a[d] == b[d]) ++d; | |
} | |
uint64_t t1 = tick(); | |
double clocks = (double)(t1-t0)*1000.0; | |
int bytes = sizeof(a); | |
double clocksPerByte = clocks / bytes; | |
printf("bytes diffed: %d, diff at pos: %d, usecs taken: %llu, %f clocks/byte\n", bytes, d, t1-t0, clocksPerByte); | |
} | |
// g++ -I/opt/vc/include -L/opt/vc/lib -lbcm_host -O3 -o test test.cpp
// sudo ./test
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
bytes diffed: 307200, diff at pos: 307200, usecs taken: 1466, 4.772135 clocks/byte