// arm-linux-gnueabihf-gcc 6.3.0 at -O2 ..versus.. bitfields
// some struct definition for pin config registers:
// Pin-configuration value packed into bitfields. Per the disassembly
// below, gcc allocates mode in bits 0-2, pull in bits 3-4, rx in bit 5
// and slew in bit 6; the remaining bits are unnamed padding up to the
// full 4-byte word.
// NOTE(review): `uint` is non-standard C++ -- presumably the glibc
// <sys/types.h> typedef for unsigned int; confirm it is in scope.
struct Pad {
// Pull resistor selection for the pad.
enum pull_t { pull_down, no_pull, pull_up };
// Output slew-rate selection.
enum slew_t { fast, slow };
// Receiver (input path) enable.
enum rx_t { rx_dis, rx_en };
uint mode : 3; // pad mux/mode selector, 0-7
pull_t pull : 2;
rx_t rx : 1;
slew_t slew : 1;
// one of the wrappers
// Convenience factory: builds a Pad with the receiver enabled and a
// defaulted fast slew rate.
static Pad io( uint mode, pull_t pull, slew_t slew = fast ) {
return { mode, pull, rx_en, slew };
}
// (note: declaring it constexpr makes some cases below even worse)
};
// The struct must occupy exactly one 32-bit word -- the point being that
// an assignment of a whole Pad could legally be a single word store.
static_assert( sizeof(Pad) == 4 );
// Test case 1: write the same configuration (mode 7, pull-down, rx
// enabled, default fast slew) to two consecutive Pad slots, building
// the value through the Pad::io() wrapper.
void foo( Pad *x ) {
Pad tmp = Pad::io( 7, Pad::pull_down );
x[0] = tmp;
x[1] = tmp;
}
// disassembly:
// movs r3, #7 // tmp = 7
// bfc r3, #3, #2 // tmp &= ~0x18
// orr r3, r3, #32 // tmp |= 0x20
// bfc r3, #6, #1 // tmp &= ~0x40
// str r3, [r0] // x[0] = tmp
// str r3, [r0, #4] // x[1] = tmp
//
// Reminder: this is -O2, not -O0 ... go home gcc, you're drunk
// okay, maybe that (completely trivial) wrapper function is hard for
// gcc to optimize? (when it's drunk anyway...)
//
// Let's try directly using aggregate initialization instead!
//
// Test case 2: identical stores, but tmp is built with direct aggregate
// initialization instead of the Pad::io() wrapper (all four fields
// spelled out explicitly).
void foo( Pad *x ) {
Pad tmp { 7, Pad::pull_down, Pad::rx_en, Pad::fast };
x[0] = tmp;
x[1] = tmp;
}
// ldrb r3, [r0] // tmp = x[0].byte[0]
// and r3, r3, #192 // tmp &= 0xc0
// orr r3, r3, #39 // tmp |= 0x27
// bfc r3, #6, #1 // tmp &= ~0x40
// strb r3, [r0] // x[0].byte[0] = tmp
// ldrb r3, [r0, #4] // tmp = x[1].byte[0]
// and r3, r3, #192 // tmp &= 0xc0
// orr r3, r3, #39 // tmp |= 0x27
// bfc r3, #6, #1 // tmp &= ~0x40
// strb r3, [r0, #4] // x[1].byte[0] = tmp
//
// ... I don't even...
//
// oddly it now suddenly seems to understand that 7 | 32 == 39,
// although combining the 'and' and 'bfc' was still too hard for it...
// but why the HELL is it performing read-modify-update on the
// lowest byte when I in fact assigned an entire 4-byte value (as
// confirmed by the static_assert).
//
// in case you're wondering: no, adding an explicit padding field to
// ensure all 32 bits are covered does not help, it makes it worse:
//
// ldrb r3, [r0] // tmp = x[0].byte[0]
// and r3, r3, #192 // tmp &= 0xc0
// orr r3, r3, #39 // tmp |= 0x27
// bfc r3, #6, #1 // tmp &= ~0x40
// strb r3, [r0] // x[0].byte[0] = tmp
// ldr r3, [r0] // tmp = x[0]
// bfc r3, #7, #25 // tmp &= 0x7f
// str r3, [r0] // x[0] = tmp
// (...repeat for x[1]...)
//
//
// My intuition that it just generates awful code for copying this
// struct in general was also not true, e.g.:
//
// Control case: copy a caller-supplied Pad into two consecutive slots.
// Here gcc does emit two plain word stores (see disassembly below),
// showing the struct copy itself is not the problem.
void copy( Pad *x, Pad tmp ) {
x[0] = tmp;
x[1] = tmp;
}
// disassembly:
// str r1, [r0]
// str r1, [r0, #4]
//
// which is not perfect (see below) but at least sane.
// Finally, for comparison, this is what clang produces for foo:
// movs r1, #39
// strd r1, r1, [r0]
//
// and for copy:
// strd r1, r1, [r0]