matthijskooijman/arm_asm_spi.cpp

## arm_asm_spi.cpp
  // Bit-banged SPI transfer of a single byte, MSB first, using a
  // cycle-counted Thumb assembly loop.
  //
  // For each of the 8 bits: MOSI is driven from the next bit of c, a
  // delay loop runs, SCK is set, MISO is sampled, a second delay loop
  // runs and SCK is cleared again.
  //
  // c:       byte to shift out on MOSI.
  // returns: the 8 bits sampled from MISO (first sampled bit ends up in
  //          bit 7).
  //
  // Put this function in RAM to prevent any unpredictable latency from
  // loading instructions from flash.
  byte BB_SPITransfer (byte c) __attribute__((section(".ramfunc")));
  byte BB_SPITransfer (byte c)
    {
    // Number of cycles between writes to SCK (excluding one SCK write
    // but including the other). These counts do not include the delay
    // loops. The movs before the delay loop is also not counted,
    // since it compensates for the missing cycle on the last branch
    // in the delay loop (when it is *not* taken).  Note that the PORT
    // I/O registers are on the "Single Cycle IOBUS" on the SAMD21, so
    // they can be accessed in 1 cycle rather than 2 for other
    // peripherals:
    // https://microchipdeveloper.com/32arm:samd21-iobus-overview
    // NOTE(review): the single-cycle IOBUS may only apply when the PORT
    // is accessed through its IOBUS alias rather than through PORT
    // itself - confirm against the SAMD21 datasheet.
    // Cycle counts come from the Instruction Set Summary in the Cortex
    // M0+ Technical Reference Manual:
    // http://infocenter.arm.com/help/topic/com.arm.doc.ddi0484c/CHDCICDF.html
    // For a reference on the assembler instructions used below, see the
    // Instruction Set chapter in the Cortex M0+ Devices Generic User
    // Guide:
    // http://infocenter.arm.com/help/topic/com.arm.doc.dui0662b/BABIHJGA.html
    const unsigned LOOP_OVERHEAD = 9;  // From SCK change to SCK change
    const unsigned CYCLES_PER_LOOP = 4; // Number of cycles in the delay loop

    // With the counts above, each SCK half-period takes
    // LOOP_OVERHEAD + CYCLES_PER_LOOP * delay_loop_count cycles.
    //
    // TODO: In theory this should go up to about 1.8MHz (2x(9+4)=26
    // cycles per loop, so 48MHz / 26 = 1.8MHz), but in practice about
    // 1.2MHz was observed. Maybe the PORT accesses are not 1-cycle
    // after all?

    // Make sure that the loop counts never become less than 1
    // (isp_delay is defined elsewhere; presumably the requested
    // half-period in CPU cycles - confirm).
    const unsigned MIN_CYCLES = CYCLES_PER_LOOP + LOOP_OVERHEAD;
    unsigned delay_cycles = max(isp_delay, MIN_CYCLES);

    // Calculate the number of delay loop counts
    // TODO: Maybe do this calculation in setIspSpeed already?
    unsigned delay_loop_count = (delay_cycles - LOOP_OVERHEAD) / CYCLES_PER_LOOP;

    // Passing the offsets within the PORT register as literals allows
    // encoding these offsets efficiently in the ldr/str instructions as
    // long as they are <= 124 (limitation of ldr/str instruction). This
    // is more efficient than loading the absolute values of all these.
    const unsigned MOSI_CLR_OFFSET = (unsigned)( (char*)(&PORT->Group[BB_MOSI_PORT].OUTCLR.reg) - (char*)(PORT) );
    const unsigned MOSI_SET_OFFSET = (unsigned)( (char*)(&PORT->Group[BB_MOSI_PORT].OUTSET.reg) - (char*)(PORT) );
    const unsigned SCK_CLR_OFFSET = (unsigned)( (char*)(&PORT->Group[BB_SCK_PORT].OUTCLR.reg) - (char*)(PORT) );
    const unsigned SCK_SET_OFFSET = (unsigned)( (char*)(&PORT->Group[BB_SCK_PORT].OUTSET.reg) - (char*)(PORT) );
    const unsigned MISO_IN_OFFSET = (unsigned)( (char*)(&PORT->Group[BB_MISO_PORT].IN.reg) - (char*)(PORT) );

    // Scratch register for the asm below. It is only ever written
    // before being read, so it is passed as an early-clobber output.
    unsigned tmp;
    // Bit counter. The asm decrements this, so it must be a read-write
    // operand backed by a variable: asm code is not allowed to modify
    // input-only operands.
    unsigned bits_left = 8;
    asm volatile (
      // Use the unified ARM/Thumb syntax, which seems to be more
      // universally used and corresponds to what avr-objdump outputs
      // See https://sourceware.org/binutils/docs/as/ARM_002dInstruction_002dSet.html
      ".syntax unified\n\t"
      // Shift c left so that bit 7 (the MSB of the byte) is now at
      // bit 31 (which can be easily tested when shifting it out).
      "lsls  %[c], %[c], #24\n\t"

      // Start of 8-bit loop
      "0:\n\t"
      // Shift c by one, then check the carry bit shifted out
      "lsls  %[c], %[c], #1                          /* 1 cycle */\n\t"
      "bcs  set_mosi%=                               /* 2 if taken, 1 otherwise */\n\t"
      "str  %[mosi_bit_mask], [%[port], %[mosi_clr]] /* 1 cycle */\n\t"
      "b    done_mosi%=                              /* 2 cycles */\n\t"
      "set_mosi%=:\n\t"
      // To balance cycles for both branches
      "nop                                           /* 1 cycle */\n\t"
      "str  %[mosi_bit_mask], [%[port], %[mosi_set]] /* 1 cycle */\n\t"
      "done_mosi%=:\n\t"

      // Then, delay for count * 4 cycles
      "movs %[tmp], %[delay_loop_count]              /* 1 cycle */\n\t"
      "1:\n\t"
      " subs  %[tmp], %[tmp], #1                     /* 1 cycle */\n\t"
      // TODO: This cmp can be removed (subs already sets flags), but
      // then we have 3 cycles per loop, which is of course more work to
      // calculate the cycle count...
      " cmp  %[tmp], #0                              /* 1 cycle */\n\t"
      "bne  1b                                       /* 2 if taken, 1 otherwise */\n\t"

      //    Set SCK
      "str  %[sck_bit_mask], [%[port], %[sck_set]]   /* 1 cycle */\n\t"

      //    Read MISO
      "ldr  %[tmp], [%[port], %[miso_in]]            /* 1 cycle */\n\t"
      //    Move bit read to LSB and or into c
      "lsls %[tmp], %[miso_bit_inv]                  /* 1 cycle */\n\t"
      "lsrs  %[tmp], #31                             /* 1 cycle */\n\t"
      "orrs  %[c], %[tmp]                            /* 1 cycle */\n\t"
      // Then, delay for count * 4 cycles
      "movs %[tmp], %[delay_loop_count]              /* 1 cycle */\n\t"
      "2:\n\t"
      " subs  %[tmp], %[tmp], #1                     /* 1 cycle */\n\t"
      // This cmp is redundant for the flags, but keeps this delay loop
      // at the same 4 cycles per iteration as the first one. Without
      // it this loop takes 3 cycles per iteration, which makes this
      // SCK phase shorter than the other one and shorter than
      // CYCLES_PER_LOOP assumes.
      " cmp  %[tmp], #0                              /* 1 cycle */\n\t"
      " bne  2b                                      /* 2 if taken, 1 otherwise */\n\t"

      // Add nops to balance the number of cycles from SCK to SCK
      "nop\n\tnop\n\tnop\n\tnop                      /* 4 cycles */\n\t"

      // Clear SCK
      "str %[sck_bit_mask], [%[port], %[sck_clr]]    /* 1 cycle */\n\t"

      // Loop
      "subs %[loop], %[loop], #1                     /* 1 cycle */\n\t"
      "bne  0b                                       /* 2 if taken, 1 otherwise */\n\t"
      // Revert to the default "divided" syntax, which is apparently
      // generated by gcc (without this, the code generated below will
      // break).
      ".syntax divided\n\t"
      // Below, map C-level variables and values into assembler
      // registers and immediate values. See
      // http://www.ethernut.de/en/documents/arm-inline-asm.html
      : // Outputs:
        // Early-clobber: tmp is written while inputs are still needed,
        // so it must not share a register with any of them.
        [tmp] "=&l" (tmp),
        // This puts c in a 32-bit register for us to play with. After
        // 8 iterations all original bits have been shifted out and
        // only the 8 bits read from MISO remain in the low byte.
        [c] "+l" (c),
        [loop] "+l" (bits_left)
      : // Inputs:
        [port] "l" (PORT),
        [sck_clr] "M" (SCK_CLR_OFFSET),
        [sck_set] "M" (SCK_SET_OFFSET),
        [sck_bit_mask] "l" (1UL << BB_SCK_BIT),
        [mosi_clr] "M" (MOSI_CLR_OFFSET),
        [mosi_set] "M" (MOSI_SET_OFFSET),
        [mosi_bit_mask] "l" (1UL << BB_MOSI_BIT),
        [miso_in] "M" (MISO_IN_OFFSET),
        [miso_bit_inv] "N" (31 - BB_MISO_BIT),
        [delay_loop_count] "l" (delay_loop_count)
      : // Clobbers:
        // Flags are changed, and the PORT registers are accessed
        // through memory that is not listed as an operand.
        "cc", "memory"
    );
    // TODO: Static assert that all offsets are <= 124
    // (since gcc has no operand class for this particular limitation).

    return c;
    }  // end of BB_SPITransfer
	// Bit-banged SPI transfer of a single byte, MSB first, using a
	// cycle-counted Thumb assembly loop.
	//
	// For each of the 8 bits: MOSI is driven from the next bit of c, a
	// delay loop runs, SCK is set, MISO is sampled, a second delay loop
	// runs and SCK is cleared again.
	//
	// c:       byte to shift out on MOSI.
	// returns: the 8 bits sampled from MISO (first sampled bit ends up in
	//          bit 7).
	//
	// Put this function in RAM to prevent any unpredictable latency from
	// loading instructions from flash.
	byte BB_SPITransfer (byte c) __attribute__((section(".ramfunc")));
	byte BB_SPITransfer (byte c)
	{
	// Number of cycles between writes to SCK (excluding one SCK write
	// but including the other). These counts do not include the delay
	// loops. The movs before the delay loop is also not counted,
	// since it compensates for the missing cycle on the last branch
	// in the delay loop (when it is not taken). Note that the PORT
	// I/O registers are on the "Single Cycle IOBUS" on the SAMD21, so
	// they can be accessed in 1 cycle rather than 2 for other
	// peripherals:
	// https://microchipdeveloper.com/32arm:samd21-iobus-overview
	// NOTE(review): the single-cycle IOBUS may only apply when the PORT
	// is accessed through its IOBUS alias rather than through PORT
	// itself - confirm against the SAMD21 datasheet.
	// Cycle counts come from the Instruction Set Summary in the Cortex
	// M0+ Technical Reference Manual:
	// http://infocenter.arm.com/help/topic/com.arm.doc.ddi0484c/CHDCICDF.html
	// For a reference on the assembler instructions used below, see the
	// Instruction Set chapter in the Cortex M0+ Devices Generic User
	// Guide:
	// http://infocenter.arm.com/help/topic/com.arm.doc.dui0662b/BABIHJGA.html
	const unsigned LOOP_OVERHEAD = 9; // From SCK change to SCK change
	const unsigned CYCLES_PER_LOOP = 4; // Number of cycles in the delay loop

	// With the counts above, each SCK half-period takes
	// LOOP_OVERHEAD + CYCLES_PER_LOOP * delay_loop_count cycles.
	//
	// TODO: In theory this should go up to about 1.8MHz (2x(9+4)=26
	// cycles per loop, so 48MHz / 26 = 1.8MHz), but in practice about
	// 1.2MHz was observed. Maybe the PORT accesses are not 1-cycle
	// after all?

	// Make sure that the loop counts never become less than 1
	// (isp_delay is defined elsewhere; presumably the requested
	// half-period in CPU cycles - confirm).
	const unsigned MIN_CYCLES = CYCLES_PER_LOOP + LOOP_OVERHEAD;
	unsigned delay_cycles = max(isp_delay, MIN_CYCLES);

	// Calculate the number of delay loop counts
	// TODO: Maybe do this calculation in setIspSpeed already?
	unsigned delay_loop_count = (delay_cycles - LOOP_OVERHEAD) / CYCLES_PER_LOOP;

	// Passing the offsets within the PORT register as literals allows
	// encoding these offsets efficiently in the ldr/str instructions as
	// long as they are <= 124 (limitation of ldr/str instruction). This
	// is more efficient than loading the absolute values of all these.
	// The addresses are cast to char* (not char) so that the
	// subtraction yields the byte offset of each register within PORT.
	const unsigned MOSI_CLR_OFFSET = (unsigned)( (char*)(&PORT->Group[BB_MOSI_PORT].OUTCLR.reg) - (char*)(PORT) );
	const unsigned MOSI_SET_OFFSET = (unsigned)( (char*)(&PORT->Group[BB_MOSI_PORT].OUTSET.reg) - (char*)(PORT) );
	const unsigned SCK_CLR_OFFSET = (unsigned)( (char*)(&PORT->Group[BB_SCK_PORT].OUTCLR.reg) - (char*)(PORT) );
	const unsigned SCK_SET_OFFSET = (unsigned)( (char*)(&PORT->Group[BB_SCK_PORT].OUTSET.reg) - (char*)(PORT) );
	const unsigned MISO_IN_OFFSET = (unsigned)( (char*)(&PORT->Group[BB_MISO_PORT].IN.reg) - (char*)(PORT) );

	// Scratch register for the asm below. It is only ever written
	// before being read, so it is passed as an early-clobber output.
	unsigned tmp;
	// Bit counter. The asm decrements this, so it must be a read-write
	// operand backed by a variable: asm code is not allowed to modify
	// input-only operands.
	unsigned bits_left = 8;
	asm volatile (
	// Use the unified ARM/Thumb syntax, which seems to be more
	// universally used and corresponds to what avr-objdump outputs
	// See https://sourceware.org/binutils/docs/as/ARM_002dInstruction_002dSet.html
	".syntax unified\n\t"
	// Shift c left so that bit 7 (the MSB of the byte) is now at
	// bit 31 (which can be easily tested when shifting it out).
	"lsls %[c], %[c], #24\n\t"

	// Start of 8-bit loop
	"0:\n\t"
	// Shift c by one, then check the carry bit shifted out
	"lsls %[c], %[c], #1 /* 1 cycle */\n\t"
	"bcs set_mosi%= /* 2 if taken, 1 otherwise */\n\t"
	"str %[mosi_bit_mask], [%[port], %[mosi_clr]] /* 1 cycle */\n\t"
	"b done_mosi%= /* 2 cycles */\n\t"
	"set_mosi%=:\n\t"
	// To balance cycles for both branches
	"nop /* 1 cycle */\n\t"
	"str %[mosi_bit_mask], [%[port], %[mosi_set]] /* 1 cycle */\n\t"
	"done_mosi%=:\n\t"

	// Then, delay for count * 4 cycles
	"movs %[tmp], %[delay_loop_count] /* 1 cycle */\n\t"
	"1:\n\t"
	" subs %[tmp], %[tmp], #1 /* 1 cycle */\n\t"
	// TODO: This cmp can be removed (subs already sets flags), but
	// then we have 3 cycles per loop, which is of course more work to
	// calculate the cycle count...
	" cmp %[tmp], #0 /* 1 cycle */\n\t"
	"bne 1b /* 2 if taken, 1 otherwise */\n\t"

	// Set SCK
	"str %[sck_bit_mask], [%[port], %[sck_set]] /* 1 cycle */\n\t"

	// Read MISO
	"ldr %[tmp], [%[port], %[miso_in]] /* 1 cycle */\n\t"
	// Move bit read to LSB and or into c
	"lsls %[tmp], %[miso_bit_inv] /* 1 cycle */\n\t"
	"lsrs %[tmp], #31 /* 1 cycle */\n\t"
	"orrs %[c], %[tmp] /* 1 cycle */\n\t"
	// Then, delay for count * 4 cycles
	"movs %[tmp], %[delay_loop_count] /* 1 cycle */\n\t"
	"2:\n\t"
	" subs %[tmp], %[tmp], #1 /* 1 cycle */\n\t"
	// This cmp is redundant for the flags, but keeps this delay loop
	// at the same 4 cycles per iteration as the first one. Without
	// it this loop takes 3 cycles per iteration, which makes this
	// SCK phase shorter than the other one and shorter than
	// CYCLES_PER_LOOP assumes.
	" cmp %[tmp], #0 /* 1 cycle */\n\t"
	" bne 2b /* 2 if taken, 1 otherwise */\n\t"

	// Add nops to balance the number of cycles from SCK to SCK
	"nop\n\tnop\n\tnop\n\tnop /* 4 cycles */\n\t"

	// Clear SCK
	"str %[sck_bit_mask], [%[port], %[sck_clr]] /* 1 cycle */\n\t"

	// Loop
	"subs %[loop], %[loop], #1 /* 1 cycle */\n\t"
	"bne 0b /* 2 if taken, 1 otherwise */\n\t"
	// Revert to the default "divided" syntax, which is apparently
	// generated by gcc (without this, the code generated below will
	// break).
	".syntax divided\n\t"
	// Below, map C-level variables and values into assembler
	// registers and immediate values. See
	// http://www.ethernut.de/en/documents/arm-inline-asm.html
	: // Outputs:
	// Early-clobber: tmp is written while inputs are still needed,
	// so it must not share a register with any of them.
	[tmp] "=&l" (tmp),
	// This puts c in a 32-bit register for us to play with. After
	// 8 iterations all original bits have been shifted out and
	// only the 8 bits read from MISO remain in the low byte.
	[c] "+l" (c),
	[loop] "+l" (bits_left)
	: // Inputs:
	[port] "l" (PORT),
	[sck_clr] "M" (SCK_CLR_OFFSET),
	[sck_set] "M" (SCK_SET_OFFSET),
	[sck_bit_mask] "l" (1UL << BB_SCK_BIT),
	[mosi_clr] "M" (MOSI_CLR_OFFSET),
	[mosi_set] "M" (MOSI_SET_OFFSET),
	[mosi_bit_mask] "l" (1UL << BB_MOSI_BIT),
	[miso_in] "M" (MISO_IN_OFFSET),
	[miso_bit_inv] "N" (31 - BB_MISO_BIT),
	[delay_loop_count] "l" (delay_loop_count)
	: // Clobbers:
	// Flags are changed, and the PORT registers are accessed
	// through memory that is not listed as an operand.
	"cc", "memory"
	);
	// TODO: Static assert that all offsets are <= 124
	// (since gcc has no operand class for this particular limitation).

	return c;
	} // end of BB_SPITransfer