Skip to content

Instantly share code, notes, and snippets.

@juj
Created July 5, 2018 18:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save juj/e6423dcd3c37470d7e96014d39ba983c to your computer and use it in GitHub Desktop.
Save juj/e6423dcd3c37470d7e96014d39ba983c to your computer and use it in GitHub Desktop.
Raspberry Pi Zero linear memory block diff
#include <stdio.h>
#include <stdlib.h>
#include <memory.h>
#include <syslog.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <errno.h>
#include <stdint.h>
#include <bcm_host.h>
// Pointer to the 64-bit timer value used for timing. It is null until main()
// maps the peripheral window from /dev/mem and points it at the timer; the
// benchmark treats the value read through it as microseconds.
volatile uint64_t *systemTimerRegister = 0;
// Reads the current timer value. Only valid once systemTimerRegister has been set.
#define tick() (*systemTimerRegister)
// The two buffers that are compared against each other: 320*480*2 = 307200 bytes each
// (presumably one 320x480 frame at 16 bits per pixel -- TODO confirm).
uint8_t a[320*480*2];
uint8_t b[320*480*2];
/*
 * Coarse search for the first difference between two 307200-byte buffers.
 *
 * Both buffers are walked in lockstep, 32 bytes per loop iteration, as two
 * groups of four 32-bit words. The walk stops at the first 16-byte group that
 * contains a differing word.
 *
 * Parameters:
 *   a, b - buffers of at least 307200 bytes each. On ARM they are read with
 *          ldmia, so they should be word-aligned.
 *
 * Returns the byte offset (relative to a) of the END of the 16-byte group in
 * which the first difference lies, i.e. the first differing byte is somewhere
 * in [result-16, result). Returns 307200 when no difference was found.
 * Note that a difference inside the final 16 bytes also yields 307200, so the
 * caller has to re-check the tail if it needs an exact answer.
 */
int diff(uint8_t *a, uint8_t *b)
{
#if defined(__arm__)
  uint8_t *endPtr;
  // __asm__ (rather than asm) also compiles in strict ISO C modes.
  __asm__ volatile(
    "mov r1, %[a]\n" // pointer to buffer 1 to diff
    "add r0, r1, #307200\n" // end pointer of buffer 1, loop finish condition
    "mov r2, %[b]\n" // pointer to buffer 2 to diff
    // Local numeric labels: named labels would be defined twice if the compiler
    // inlines or clones this function, which makes the assembler reject the file.
    "1:\n"
    "pld [r1, #128]\n" // preload data caches for both buffers 128 bytes ahead of time
    "pld [r2, #128]\n"
    "ldmia r1!, {r3,r4,r5,r6}\n" // load 4x32-bit elements of buffer 1
    "ldmia r2!, {r7,r8,r9,r10}\n" // corresponding elements from buffer 2
    "cmp r3, r7\n" // compare all elements for diff
    "bne 2f\n"
    "cmp r4, r8\n"
    "bne 2f\n"
    "cmp r5, r9\n"
    "bne 2f\n"
    "cmp r6, r10\n"
    "bne 2f\n"
    // unroll once to process 8x32bits = 32 bytes = L1 data cache line size of Raspberry Pi Zero in one iteration
    "ldmia r1!, {r3,r4,r5,r6}\n"
    "ldmia r2!, {r7,r8,r9,r10}\n"
    "cmp r3, r7\n"
    "bne 2f\n"
    "cmp r4, r8\n"
    "bne 2f\n"
    "cmp r5, r9\n"
    "bne 2f\n"
    "cmp r6, r10\n"
    "bne 2f\n"
    "cmp r0, r1\n" // test loop end condition
    "bne 1b\n"
    "2:\n"
    "mov %[result], r1\n\t" // r1 has already been advanced past the group that was just compared
    : [result]"=r"(endPtr)
    : [a]"r"(a), [b]"r"(b)
    // "cc": cmp modifies the condition flags.
    // "memory": the asm reads both buffers through the pointers, so pending
    // stores to them must be completed before it runs.
    : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc", "memory"
  );
  return endPtr-a;
#else
  // Portable path for non-ARM builds; produces the same results as the ARM loop.
  enum { DIFF_BYTES = 307200, GROUP_BYTES = 16 };
  for (int groupEnd = GROUP_BYTES; groupEnd <= DIFF_BYTES; groupEnd += GROUP_BYTES)
  {
    for (int i = groupEnd - GROUP_BYTES; i < groupEnd; ++i)
    {
      if (a[i] != b[i]) return groupEnd;
    }
  }
  return DIFF_BYTES;
#endif
}
// Byte offset of the timer register block inside the mapped peripheral window.
#define BCM2835_TIMER_BASE 0x3000
/*
 * Benchmark driver: maps the peripheral window from /dev/mem to get at the
 * timer, fills both global buffers, times one diff() call plus the exact
 * refinement of its coarse result, and prints the outcome.
 *
 * Returns -1 if /dev/mem cannot be opened or mapped (needs root), 0 otherwise.
 */
int main()
{
  const int bufferSize = (int)sizeof(a);

  int mem_fd = open("/dev/mem", O_RDWR|O_SYNC);
  if (mem_fd < 0)
  {
    perror("open /dev/mem");
    return -1;
  }
  void *bcm2835 = mmap(NULL, bcm_host_get_peripheral_size(), (PROT_READ | PROT_WRITE), MAP_SHARED, mem_fd, bcm_host_get_peripheral_address());
  if (bcm2835 == MAP_FAILED)
  {
    perror("mmap /dev/mem");
    return -1;
  }
  systemTimerRegister = (volatile uint64_t*)((uintptr_t)bcm2835 + BCM2835_TIMER_BASE + 0x04); // Generates an unaligned 64-bit pointer, but seems to be fine.

  // The fill value depends on a volatile timer read, so the compiler cannot
  // know the buffer contents at build time. Unless the timer happens to read
  // zero, every byte ends up as 0xEE.
  for(int i = 0; i < bufferSize; ++i)
  {
    if (tick() == 0) a[i] = 0; else a[i] = 0xEE;
  }
  for(int i = 0; i < bufferSize; ++i)
  {
    if (tick() == 0) b[i] = 0; else b[i] = 0xEE;
  }

  uint64_t t0 = tick();
  int d = diff(a, b);
  // diff is coarse, find the exact position outside hot loop.
  // Always re-scan the last 32 bytes diff() covered: a mismatch in the final
  // 16 bytes makes diff() return the full size, same as "no difference".
  d -= 32;
  // A mismatch in the first 16 bytes makes diff() return 16; without the clamp
  // the scan would start at index -16.
  if (d < 0) d = 0;
  while(d < bufferSize && a[d] == b[d]) ++d;
  uint64_t t1 = tick();

  double clocks = (double)(t1-t0)*1000.0; // microseconds -> clocks; assumes a 1 GHz core clock
  int bytes = bufferSize;
  double clocksPerByte = clocks / bytes;
  // Cast so the argument matches %llu regardless of how uint64_t is defined.
  printf("bytes diffed: %d, diff at pos: %d, usecs taken: %llu, %f clocks/byte\n", bytes, d, (unsigned long long)(t1-t0), clocksPerByte);
}
// g++ -I/opt/vc/include -L/opt/vc/lib -lbcm_host -O3 -o test test.cpp
// sudo ./test
@juj
Copy link
Author

juj commented Jul 5, 2018

bytes diffed: 307200, diff at pos: 307200, usecs taken: 1466, 4.772135 clocks/byte

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment