Skip to content

Instantly share code, notes, and snippets.

@matthijskooijman
Created March 28, 2020 07:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save matthijskooijman/e636e1f962125e58a80c583260a96c95 to your computer and use it in GitHub Desktop.
Save matthijskooijman/e636e1f962125e58a80c583260a96c95 to your computer and use it in GitHub Desktop.
// Transfer one byte over the bit-banged SPI pins and return the byte
// that was clocked in at the same time.
//
// c: byte to shift out on MOSI, most significant bit first.
// Returns the 8 bits sampled from MISO (the first bit sampled ends up
// in bit 7 of the result).
//
// For every bit, MOSI is written first, then SCK is set, MISO is
// sampled right after SCK was set, and finally SCK is cleared again,
// so SCK is left cleared when the function returns. The time between
// two SCK changes is derived from isp_delay (defined elsewhere),
// expressed in CPU cycles and clamped to the minimum this code can
// achieve.
//
// Put this function in RAM to prevent any unpredictable latency from
// loading instructions from flash.
byte BB_SPITransfer (byte c) __attribute__((section(".ramfunc")));
byte BB_SPITransfer (byte c)
{
  // Number of cycles between writes to SCK (excluding one SCK write
  // but including the other). These counts do not include the delay
  // loops. The movs before the delay loop is also not counted,
  // since it compensates for the missing cycle on the last branch
  // in the delay loop (when it is *not* taken). Note that the PORT
  // I/O registers are on the "Single Cycle IOBUS" on the SAMD21, so
  // they can be accessed in 1 cycle rather than 2 for other
  // peripherals:
  // https://microchipdeveloper.com/32arm:samd21-iobus-overview
  // Cycle counts come from the Instruction Set Summary in the Cortex
  // M0+ Technical Reference Manual:
  // http://infocenter.arm.com/help/topic/com.arm.doc.ddi0484c/CHDCICDF.html
  // For a reference on the assembler instructions used below, see the
  // Instruction Set chapter in the Cortex M0+ Devices Generic User
  // Guide:
  // http://infocenter.arm.com/help/topic/com.arm.doc.dui0662b/BABIHJGA.html
  const unsigned LOOP_OVERHEAD = 9; // From SCK change to SCK change
  const unsigned CYCLES_PER_LOOP = 4; // Number of cycles in the delay loop

  // TODO: In theory this should go up to about 1.8Mhz (2x(9+4)=26
  // cycles per loop, so 48Mhz / 26 = 1.8Mhz), and be perfectly
  // balanced, but in practice I see about 1.2Mhz with the clock
  // slightly out of balance. Maybe the PORT accesses are not
  // 1-cycle after all?
  // NOTE(review): the imbalance is presumably explained by the second
  // delay loop having been 3 instead of 4 cycles per iteration (fixed
  // below). The lower-than-expected frequency may be because the
  // single-cycle IOBUS view of the PORT lives at a separate address
  // (PORT_IOBUS in the CMSIS headers) while PORT itself is reached
  // through the regular peripheral bus - verify against the SAMD21
  // datasheet before switching, since reading IN through the IOBUS
  // may need additional configuration.

  // Make sure that the loop counts never become less than 1
  const unsigned MIN_CYCLES = CYCLES_PER_LOOP + LOOP_OVERHEAD;
  unsigned delay_cycles = max(isp_delay, MIN_CYCLES);

  // Calculate the number of delay loop counts
  // TODO: Maybe do this calculation in setIspSpeed already?
  unsigned delay_loop_count = (delay_cycles - LOOP_OVERHEAD) / CYCLES_PER_LOOP;

  // Passing the offsets within the PORT register as literals allows
  // encoding these offsets efficiently in the ldr/str instructions as
  // long as they are <= 124 (limitation of ldr/str instruction). This
  // is more efficient than loading the absolute values of all these.
  const unsigned MOSI_CLR_OFFSET = (unsigned)( (char*)(&PORT->Group[BB_MOSI_PORT].OUTCLR.reg) - (char*)(PORT) );
  const unsigned MOSI_SET_OFFSET = (unsigned)( (char*)(&PORT->Group[BB_MOSI_PORT].OUTSET.reg) - (char*)(PORT) );
  const unsigned SCK_CLR_OFFSET = (unsigned)( (char*)(&PORT->Group[BB_SCK_PORT].OUTCLR.reg) - (char*)(PORT) );
  const unsigned SCK_SET_OFFSET = (unsigned)( (char*)(&PORT->Group[BB_SCK_PORT].OUTSET.reg) - (char*)(PORT) );
  const unsigned MISO_IN_OFFSET = (unsigned)( (char*)(&PORT->Group[BB_MISO_PORT].IN.reg) - (char*)(PORT) );

  // Scratch register for the asm below. It is always written (movs /
  // ldr) before it is read, so it is a pure output.
  unsigned tmp;
  // Bit counter for the asm below. The asm decrements it, so it must
  // be passed as a read-write operand: gcc assumes that input-only
  // operands are left unmodified.
  unsigned bits_left = 8;

  asm volatile (
    // Use the unified ARM/Thumb syntax, which seems to be more
    // universally used and corresponds to what objdump outputs
    // See https://sourceware.org/binutils/docs/as/ARM_002dInstruction_002dSet.html
    ".syntax unified\n\t"
    // Shift c left so that bit 7 is now at bit 31 (which can be
    // easily tested when shifting it out).
    "lsls %[c], %[c], #24\n\t"
    // Start of 8-bit loop
    "0:\n\t"
    // Shift c by one, then check the carry bit shifted out
    "lsls %[c], %[c], #1 /* 1 cycle */\n\t"
    "bcs set_mosi%= /* 2 if taken, 1 otherwise */\n\t"
    "str %[mosi_bit_mask], [%[port], %[mosi_clr]] /* 1 cycle */\n\t"
    "b done_mosi%= /* 2 cycles */\n\t"
    "set_mosi%=:\n\t"
    // To balance cycles for both branches
    "nop /* 1 cycle */\n\t"
    "str %[mosi_bit_mask], [%[port], %[mosi_set]] /* 1 cycle */\n\t"
    "done_mosi%=:\n\t"
    // Then, delay for count * 4 cycles
    "movs %[tmp], %[delay_loop_count] /* 1 cycle */\n\t"
    "1:\n\t"
    " subs %[tmp], %[tmp], #1 /* 1 cycle */\n\t"
    // TODO: This cmp can be removed (subs already sets flags), but
    // then we have 3 cycles per loop, which is of course more work to
    // calculate the cycle count...
    " cmp %[tmp], #0 /* 1 cycle */\n\t"
    " bne 1b /* 2 if taken, 1 otherwise */\n\t"
    // Set SCK
    "str %[sck_bit_mask], [%[port], %[sck_set]] /* 1 cycle */\n\t"
    // Read MISO
    "ldr %[tmp], [%[port], %[miso_in]] /* 1 cycle */\n\t"
    // Move bit read to LSB and or into c
    "lsls %[tmp], %[miso_bit_inv] /* 1 cycle */\n\t"
    "lsrs %[tmp], #31 /* 1 cycle */\n\t"
    "orrs %[c], %[tmp] /* 1 cycle */\n\t"
    // Then, delay for count * 4 cycles
    "movs %[tmp], %[delay_loop_count] /* 1 cycle */\n\t"
    "2:\n\t"
    " subs %[tmp], %[tmp], #1 /* 1 cycle */\n\t"
    // This cmp is redundant for the flags (see the first delay loop),
    // but keeps this loop at the same 4 cycles per iteration as the
    // first one, so both halves of the SCK period are equally long.
    " cmp %[tmp], #0 /* 1 cycle */\n\t"
    " bne 2b /* 2 if taken, 1 otherwise */\n\t"
    // Add nops to balance the number of cycles from SCK to SCK
    "nop\n\tnop\n\tnop\n\tnop /* 4 cycles */\n\t"
    // Clear SCK
    "str %[sck_bit_mask], [%[port], %[sck_clr]] /* 1 cycle */\n\t"
    // Loop
    "subs %[loop], %[loop], #1 /* 1 cycle */\n\t"
    "bne 0b /* 2 if taken, 1 otherwise */\n\t"
    // Revert to the default "divided" syntax, which is apparently
    // generated by gcc (without this, the code generated below will
    // break).
    ".syntax divided\n\t"
    // Below, map C-level variables and values into assembler
    // registers and immediate values. See
    // http://www.ethernut.de/en/documents/arm-inline-asm.html
    : // Outputs:
      // Early-clobber, so tmp never shares a register with an input
      // that is still needed after tmp was first written.
      [tmp] "=&l" (tmp),
      // This puts c in a 32-bit register for us to play with, but
      // the compiler takes care of truncating it to 8-bits
      // afterwards (because c is declared as byte).
      [c] "+l" (c),
      [loop] "+l" (bits_left)
    : // Inputs:
      [port] "l" (PORT),
      [sck_clr] "M" (SCK_CLR_OFFSET),
      [sck_set] "M" (SCK_SET_OFFSET),
      [sck_bit_mask] "l" (1UL << BB_SCK_BIT),
      [mosi_clr] "M" (MOSI_CLR_OFFSET),
      [mosi_set] "M" (MOSI_SET_OFFSET),
      [mosi_bit_mask] "l" (1UL << BB_MOSI_BIT),
      [miso_in] "M" (MISO_IN_OFFSET),
      [miso_bit_inv] "N" (31 - BB_MISO_BIT),
      [delay_loop_count] "l" (delay_loop_count)
    : // Clobbers:
      // Flags are changed by the shifts, subs and cmp.
      "cc",
      // The PORT registers are read and written through [port] without
      // being listed as operands.
      "memory"
  );

  // TODO: Static assert that all offsets are <= 124
  // since gcc has no operand class for this particular limitation).
  return c;
} // end of BB_SPITransfer
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment