Skip to content

Instantly share code, notes, and snippets.

@juj
Created July 8, 2018 21:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save juj/e33338d2630bdfcef8c1e35b05c7be3b to your computer and use it in GitHub Desktop.
Save juj/e33338d2630bdfcef8c1e35b05c7be3b to your computer and use it in GitHub Desktop.
Raspberry Pi Zero linear memory block diff version 2
#include <stdio.h>
#include <stdlib.h>
#include <memory.h>
#include <syslog.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <errno.h>
#include <stdint.h>
#include <bcm_host.h>
// Pointer through which the timer is read as one 64-bit value. It is null
// until main() points it into the memory-mapped peripheral window, so tick()
// must not be used before that assignment.
volatile uint64_t *systemTimerRegister = 0;
// Reads the current timer value. main() prints the difference of two reads
// as "usecs".
#define tick() (*systemTimerRegister)
// The two buffers compared by the benchmark: 320*480*2 = 307200 bytes each,
// aligned to 32 bytes.
// NOTE(review): the size looks like a 320x480 frame at 2 bytes per pixel - confirm.
uint8_t __attribute__((aligned(32))) a[320*480*2];
uint8_t __attribute__((aligned(32))) b[320*480*2];
/*
 * Coarse comparison of two 307200-byte buffers, 32 bytes at a time.
 *
 * Returns the byte offset just past the first 32-byte block in which a and b
 * differ, or 307200 when no block differs. A difference inside the final
 * block also yields 307200, so the caller has to rescan the 32 bytes that
 * precede the returned offset to find the exact position (or to learn that
 * there is none).
 *
 * Both pointers must reference at least 307200 readable bytes. The ARM path
 * loads with ldmia, so it assumes word-aligned buffers.
 */
int diff(uint8_t *a, uint8_t *b)
{
#if defined(__arm__)
  uint8_t *endPtr;
  __asm__ __volatile__(
    "mov r0, %[a]\n" // pointer to buffer 1 to diff
    "add r10, r0, #307200\n" // end pointer of buffer 1, loop finish condition
    "mov r1, %[b]\n" // pointer to buffer 2 to diff
    "start_%=:\n"
    "ldmia r0!, {r2,r3,r4,r5}\n" // load 4x32-bit elements of buffer 1
    "ldmia r1!, {r6,r7,r8,r9}\n" // corresponding elements from buffer 2
    "pld [r0, #240]\n" // r0 has already advanced 16 bytes: preloads 256 bytes past the start of this block
    "cmp r2, r6\n" // compare all elements for diff
    "cmpeq r3, r7\n" // each cmpeq only runs while everything so far matched
    "cmpeq r4, r8\n"
    "cmpeq r5, r9\n"
    // unroll once to process 8x32bits = 32 bytes = L1 data cache line size of Raspberry Pi Zero in one iteration
    "ldmia r1!, {r6,r7,r8,r9}\n"
    "ldmia r0!, {r2,r3,r4,r5}\n"
    "pld [r1, #224]\n" // r1 has already advanced 32 bytes: same 256-byte lookahead for buffer 2
    "cmpeq r2, r6\n"
    "cmpeq r3, r7\n"
    "cmpeq r4, r8\n"
    "cmpeq r5, r9\n"
    "bne end_%=\n" // some word differed; r0 already points past this block
    "cmp r0, r10\n" // test loop end condition
    "blo start_%=\n"
    "end_%=:\n"
    "mov %[result], r0\n\t"
    : [result]"=r"(endPtr)
    : [a]"r"(a), [b]"r"(b)
    // "cc": the cmp/cmpeq chain rewrites the condition flags.
    // "memory": the buffers are read through pointers that are not listed as
    // memory operands, so the compiler must complete earlier stores to them
    // before the asm runs.
    : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc", "memory"
  );
  return endPtr-a;
#else
  // Portable fallback with the same return contract as the ARM loop above.
  enum { DIFF_BYTES = 307200, BLOCK_BYTES = 32 };
  for (int blockStart = 0; blockStart < DIFF_BYTES; blockStart += BLOCK_BYTES)
  {
    for (int i = 0; i < BLOCK_BYTES; ++i)
    {
      if (a[blockStart + i] != b[blockStart + i])
        return blockStart + BLOCK_BYTES;
    }
  }
  return DIFF_BYTES;
#endif
}
#define BCM2835_TIMER_BASE 0x3000
/*
 * Benchmark driver. Maps the peripheral window through /dev/mem, points
 * systemTimerRegister at the timer inside it, then runs 32 test cases. Each
 * case fills both buffers, plants one differing byte near the end of a, times
 * diff() plus the exact-position refinement, and prints one result line.
 *
 * Returns 1 if /dev/mem cannot be opened or mapped, 0 otherwise. The file
 * descriptor and the mapping are left in place until the process exits.
 */
int main()
{
  int mem_fd = open("/dev/mem", O_RDWR|O_SYNC);
  if (mem_fd < 0)
  {
    perror("open /dev/mem");
    return 1;
  }
  void *bcm2835 = mmap(NULL, bcm_host_get_peripheral_size(), (PROT_READ | PROT_WRITE), MAP_SHARED, mem_fd, bcm_host_get_peripheral_address());
  if (bcm2835 == MAP_FAILED)
  {
    // Without this check the timer pointer would be derived from MAP_FAILED.
    perror("mmap peripherals");
    return 1;
  }
  systemTimerRegister = (volatile uint64_t*)((uintptr_t)bcm2835 + BCM2835_TIMER_BASE + 0x04); // Generates an unaligned 64-bit pointer, but seems to be fine.
  const int bytes = sizeof(a);
  for(int x = 0; x < 32; ++x) // 32 test cases
  {
    // Every byte becomes 0xEE unless the timer happens to read 0.
    // NOTE(review): reading the volatile timer in the condition presumably
    // keeps the compiler from folding the fill - confirm intent.
    for(int i = 0; i < bytes; ++i) if (tick() == 0) a[i] = 0; else a[i] = 0xEE;
    for(int i = 0; i < bytes; ++i) if (tick() == 0) b[i] = 0; else b[i] = 0xEE;
    a[bytes-1-x] = 0x10; // Introduce an actual diff towards the very end
    uint64_t t0 = tick();
    int d = diff(a, b);
    int e = d;
    // diff is coarse, find the exact position outside hot loop
    if (e <= bytes)
    {
      e -= 32; // step back to the start of the block diff() stopped after
      // Bounds check first so a[bytes]/b[bytes] is never read when the
      // block turns out to be identical; e == bytes then means "no diff".
      while(e < bytes && a[e] == b[e]) ++e;
    }
    uint64_t t1 = tick();
    // NOTE(review): x1000 presumably converts 1 MHz timer ticks to cycles of a 1 GHz core - confirm.
    double clocks = (double)(t1-t0)*1000.0;
    if (clocks == 0) clocks = 1; // avoid dividing by zero in the MB/sec figure
    double clocksPerByte = clocks / bytes;
    // %llu needs unsigned long long; uint64_t is not that type on every target.
    printf("bytes diffed: %d, diff at pos: %d (exact: %d), usecs taken: %llu, %f clocks/byte, %f clocks/32b (%f MB/sec)\n", bytes, d, e, (unsigned long long)(t1-t0), clocksPerByte, clocksPerByte*32.0, 1e9/clocksPerByte / 1000000.0);
  }
  return 0;
}
// Build: g++ -O3 -I/opt/vc/include -L/opt/vc/lib -o test test.cpp -lbcm_host
// Run (root is required to open /dev/mem): sudo ./test
@juj
Copy link
Author

juj commented Jul 8, 2018

bytes diffed: 307200, diff at pos: 307200 (exact: 307199), usecs taken: 1446, 4.707031 clocks/byte, 150.625000 clocks/32b (212.448133 MB/sec)
bytes diffed: 307200, diff at pos: 307200 (exact: 307198), usecs taken: 1431, 4.658203 clocks/byte, 149.062500 clocks/32b (214.675052 MB/sec)
bytes diffed: 307200, diff at pos: 307200 (exact: 307197), usecs taken: 1423, 4.632161 clocks/byte, 148.229167 clocks/32b (215.881940 MB/sec)
bytes diffed: 307200, diff at pos: 307200 (exact: 307196), usecs taken: 1428, 4.648438 clocks/byte, 148.750000 clocks/32b (215.126050 MB/sec)
bytes diffed: 307200, diff at pos: 307200 (exact: 307195), usecs taken: 1424, 4.635417 clocks/byte, 148.333333 clocks/32b (215.730337 MB/sec)
bytes diffed: 307200, diff at pos: 307200 (exact: 307194), usecs taken: 1418, 4.615885 clocks/byte, 147.708333 clocks/32b (216.643159 MB/sec)
bytes diffed: 307200, diff at pos: 307200 (exact: 307193), usecs taken: 1425, 4.638672 clocks/byte, 148.437500 clocks/32b (215.578947 MB/sec)
bytes diffed: 307200, diff at pos: 307200 (exact: 307192), usecs taken: 1439, 4.684245 clocks/byte, 149.895833 clocks/32b (213.481584 MB/sec)
bytes diffed: 307200, diff at pos: 307200 (exact: 307191), usecs taken: 1420, 4.622396 clocks/byte, 147.916667 clocks/32b (216.338028 MB/sec)
bytes diffed: 307200, diff at pos: 307200 (exact: 307190), usecs taken: 1427, 4.645182 clocks/byte, 148.645833 clocks/32b (215.276804 MB/sec)
bytes diffed: 307200, diff at pos: 307200 (exact: 307189), usecs taken: 1430, 4.654948 clocks/byte, 148.958333 clocks/32b (214.825175 MB/sec)
bytes diffed: 307200, diff at pos: 307200 (exact: 307188), usecs taken: 1432, 4.661458 clocks/byte, 149.166667 clocks/32b (214.525140 MB/sec)
bytes diffed: 307200, diff at pos: 307200 (exact: 307187), usecs taken: 1431, 4.658203 clocks/byte, 149.062500 clocks/32b (214.675052 MB/sec)
bytes diffed: 307200, diff at pos: 307200 (exact: 307186), usecs taken: 1431, 4.658203 clocks/byte, 149.062500 clocks/32b (214.675052 MB/sec)
bytes diffed: 307200, diff at pos: 307200 (exact: 307185), usecs taken: 1430, 4.654948 clocks/byte, 148.958333 clocks/32b (214.825175 MB/sec)
bytes diffed: 307200, diff at pos: 307200 (exact: 307184), usecs taken: 1868, 6.080729 clocks/byte, 194.583333 clocks/32b (164.453961 MB/sec)
bytes diffed: 307200, diff at pos: 307200 (exact: 307183), usecs taken: 1433, 4.664714 clocks/byte, 149.270833 clocks/32b (214.375436 MB/sec)
bytes diffed: 307200, diff at pos: 307200 (exact: 307182), usecs taken: 1434, 4.667969 clocks/byte, 149.375000 clocks/32b (214.225941 MB/sec)
bytes diffed: 307200, diff at pos: 307200 (exact: 307181), usecs taken: 1430, 4.654948 clocks/byte, 148.958333 clocks/32b (214.825175 MB/sec)
bytes diffed: 307200, diff at pos: 307200 (exact: 307180), usecs taken: 1432, 4.661458 clocks/byte, 149.166667 clocks/32b (214.525140 MB/sec)
bytes diffed: 307200, diff at pos: 307200 (exact: 307179), usecs taken: 1433, 4.664714 clocks/byte, 149.270833 clocks/32b (214.375436 MB/sec)
bytes diffed: 307200, diff at pos: 307200 (exact: 307178), usecs taken: 1505, 4.899089 clocks/byte, 156.770833 clocks/32b (204.119601 MB/sec)
bytes diffed: 307200, diff at pos: 307200 (exact: 307177), usecs taken: 1512, 4.921875 clocks/byte, 157.500000 clocks/32b (203.174603 MB/sec)
bytes diffed: 307200, diff at pos: 307200 (exact: 307176), usecs taken: 1538, 5.006510 clocks/byte, 160.208333 clocks/32b (199.739922 MB/sec)
bytes diffed: 307200, diff at pos: 307200 (exact: 307175), usecs taken: 1922, 6.256510 clocks/byte, 200.208333 clocks/32b (159.833507 MB/sec)
bytes diffed: 307200, diff at pos: 307200 (exact: 307174), usecs taken: 1494, 4.863281 clocks/byte, 155.625000 clocks/32b (205.622490 MB/sec)
bytes diffed: 307200, diff at pos: 307200 (exact: 307173), usecs taken: 1431, 4.658203 clocks/byte, 149.062500 clocks/32b (214.675052 MB/sec)
bytes diffed: 307200, diff at pos: 307200 (exact: 307172), usecs taken: 1431, 4.658203 clocks/byte, 149.062500 clocks/32b (214.675052 MB/sec)
bytes diffed: 307200, diff at pos: 307200 (exact: 307171), usecs taken: 1431, 4.658203 clocks/byte, 149.062500 clocks/32b (214.675052 MB/sec)
bytes diffed: 307200, diff at pos: 307200 (exact: 307170), usecs taken: 1435, 4.671224 clocks/byte, 149.479167 clocks/32b (214.076655 MB/sec)
bytes diffed: 307200, diff at pos: 307200 (exact: 307169), usecs taken: 1427, 4.645182 clocks/byte, 148.645833 clocks/32b (215.276804 MB/sec)
bytes diffed: 307200, diff at pos: 307200 (exact: 307168), usecs taken: 1432, 4.661458 clocks/byte, 149.166667 clocks/32b (214.525140 MB/sec)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment