Wren6991/amo.S

## amo.S
// Opcode value that handle_illegal_instr compares against the low bits of
// the trapping instruction to select the AMO emulation path.
#define OPCODE_AMO  0x2f

// funct5 values, i.e. instruction bits 31:27 as extracted by handle_amo
// (srli by 27), selecting which atomic operation to emulate.
#define FUNCT5_AMOSWAP 0x01
#define FUNCT5_AMOADD  0x00
#define FUNCT5_AMOXOR  0x04
#define FUNCT5_AMOAND  0x0c
#define FUNCT5_AMOOR   0x08
#define FUNCT5_AMOMIN  0x10
#define FUNCT5_AMOMAX  0x14
#define FUNCT5_AMOMINU 0x18
#define FUNCT5_AMOMAXU 0x1c

// Illegal-instruction trap entry point.
//
// Builds a 128-byte frame on the interrupted stack in which register xN is
// stored at offset 4*N from the new sp, fetches the trapping instruction
// into a0, and branches to handle_amo when its 7-bit major opcode equals
// OPCODE_AMO. Any other instruction falls through to bad_instr, which steps
// mepc past it and returns.
.global handle_illegal_instr
handle_illegal_instr:
	// Spill all registers to stack, so we can index them by register number.
	// The pre-trap sp is written first: -120 + 128 == 8, so it lands in
	// slot 2 of the frame once sp has been moved down.
	sw sp, -120(sp)
	addi sp, sp, -128
	// Slot 0 holds zero so that an rs1/rs2 field of x0 reads back as 0.
	sw x0 , 0  (sp)
	sw x1 , 4  (sp)
	// skip sp (already stored above)
	sw x3 , 12 (sp)
	sw x4 , 16 (sp)
	sw x5 , 20 (sp)
	sw x6 , 24 (sp)
	sw x7 , 28 (sp)
	sw x8 , 32 (sp)
	sw x9 , 36 (sp)
	sw x10, 40 (sp)
	sw x11, 44 (sp)
	sw x12, 48 (sp)
	sw x13, 52 (sp)
	sw x14, 56 (sp)
	sw x15, 60 (sp)
	sw x16, 64 (sp)
	sw x17, 68 (sp)
	sw x18, 72 (sp)
	sw x19, 76 (sp)
	sw x20, 80 (sp)
	sw x21, 84 (sp)
	sw x22, 88 (sp)
	sw x23, 92 (sp)
	sw x24, 96 (sp)
	sw x25, 100(sp)
	sw x26, 104(sp)
	sw x27, 108(sp)
	sw x28, 112(sp)
	sw x29, 116(sp)
	sw x30, 120(sp)
	sw x31, 124(sp)

	// Get instruction in a0 without performing any unaligned accesses (LSB of
	// mepc is always 0): read it as two halfwords and merge them, low half in
	// bits 15:0 and high half in bits 31:16.
	csrr a1, mepc
	lhu a0, (a1)
	lhu a2, 2(a1)
	slli a2, a2, 16
	or a0, a0, a2
	// The major opcode is the 7 LSBs; check whether it is one we can emulate.
	andi a1, a0, 0x7f

	li a4, OPCODE_AMO
	beq a1, a4, handle_amo
	// All matches fell through, we don't know how to handle this. Just NOP...

	// Common exit path, shared by successfully emulated instructions and by
	// bad_instr (unrecognised instructions, which are simply skipped).
	//
	// mepc is advanced by 4 so that mret resumes at the instruction after the
	// trapping one. Every integer register is then reloaded from the frame
	// built on entry; an emulated instruction delivers its result by having
	// overwritten the destination register's slot in that frame.
restore_integer_regs_then_mret:
bad_instr:
	// a0 is usable as scratch here because its saved value is reloaded below.
	csrr a0, mepc
	addi a0, a0, 4
	csrw mepc, a0
	// Slot N (byte offset 4*N) holds xN. Slot 0 (x0) is never reloaded.
	lw ra,    4(sp)		// x1
	lw gp,   12(sp)		// x3
	lw tp,   16(sp)		// x4
	lw t0,   20(sp)		// x5
	lw t1,   24(sp)		// x6
	lw t2,   28(sp)		// x7
	lw s0,   32(sp)		// x8
	lw s1,   36(sp)		// x9
	lw a0,   40(sp)		// x10
	lw a1,   44(sp)		// x11
	lw a2,   48(sp)		// x12
	lw a3,   52(sp)		// x13
	lw a4,   56(sp)		// x14
	lw a5,   60(sp)		// x15
	lw a6,   64(sp)		// x16
	lw a7,   68(sp)		// x17
	lw s2,   72(sp)		// x18
	lw s3,   76(sp)		// x19
	lw s4,   80(sp)		// x20
	lw s5,   84(sp)		// x21
	lw s6,   88(sp)		// x22
	lw s7,   92(sp)		// x23
	lw s8,   96(sp)		// x24
	lw s9,  100(sp)		// x25
	lw s10, 104(sp)		// x26
	lw s11, 108(sp)		// x27
	lw t3,  112(sp)		// x28
	lw t4,  116(sp)		// x29
	lw t5,  120(sp)		// x30
	lw t6,  124(sp)		// x31
	// sp (x2) is reloaded last because it is the base of every load above.
	lw sp,    8(sp)
	mret

// Decode the AMO instruction held in a0 and dispatch to its emulation
// routine.
//
// In:  a0 = trapping instruction word
//      sp = base of the spilled register frame (xN at offset 4*N)
// Out: a2 = value of rs1 (the memory address operand)
//      a3 = value of rs2
//      a7 = comparison selector, consumed only by amo_minmax:
//             a7[1:0] = expected values for {sltu, slt}
//             a7[3:2] = care-mask for {sltu, slt}
// Only funct5 (bits 31:27) is examined; the bits below it are shifted out.
// Branches to bad_instr when funct5 matches none of the FUNCT5_* values.
handle_amo:
	// Get rs1, rs2 in a2, a3. Shifting right by (field position - 2) and
	// masking with 0x1f << 2 yields the register number already multiplied
	// by 4, which is that register's byte offset within the frame.
	srli a2, a0, 15 - 2
	andi a2, a2, 0x1f << 2
	add a2, a2, sp
	lw a2, (a2)
	srli a3, a0, 20 - 2
	andi a3, a3, 0x1f << 2
	add a3, a3, sp
	lw a3, (a3)

	// Decode correct AMO routine or fall through if no match
	srli a1, a0, 27

	// Each addi rebases a1 from (funct5 - previous candidate) to
	// (funct5 - next candidate), so a1 is zero exactly on a match.
	addi a1, a1, 0              - FUNCT5_AMOSWAP
	beqz a1, amoswap
	addi a1, a1, FUNCT5_AMOSWAP - FUNCT5_AMOADD
	beqz a1, amoadd
	addi a1, a1, FUNCT5_AMOADD  - FUNCT5_AMOXOR
	beqz a1, amoxor
	addi a1, a1, FUNCT5_AMOXOR  - FUNCT5_AMOAND
	beqz a1, amoand
	addi a1, a1, FUNCT5_AMOAND  - FUNCT5_AMOOR
	beqz a1, amoor
	addi a1, a1, FUNCT5_AMOOR   - FUNCT5_AMOMIN
	li a7, 0b0101 // care slt,  expect slt  == 1: store rs2 if rs2 <  mem (signed)
	beqz a1, amo_minmax
	addi a1, a1, FUNCT5_AMOMIN  - FUNCT5_AMOMAX
	li a7, 0b0100 // care slt,  expect slt  == 0: store rs2 if rs2 >= mem (signed)
	beqz a1, amo_minmax
	addi a1, a1, FUNCT5_AMOMAX  - FUNCT5_AMOMINU
	// The expected sltu value lives in bit 1 (bit 0 is slt and is masked
	// out here), so "sltu true" is 0b1010.
	li a7, 0b1010 // care sltu, expect sltu == 1: store rs2 if rs2 <  mem (unsigned)
	beqz a1, amo_minmax
	addi a1, a1, FUNCT5_AMOMINU - FUNCT5_AMOMAXU
	li a7, 0b1000 // care sltu, expect sltu == 0: store rs2 if rs2 >= mem (unsigned)
	beqz a1, amo_minmax
	j bad_instr


// AMO routines.
// Data for rs1, rs2 are passed in a2, a3.
// Data for rd is returned in a4.

// Emulate AMOADD.W: store (old + a3) to the word at address a2.
// a4 receives the old memory value (returned for rd); a5 is scratch and
// receives the sc.w status. The sequence is retried from the lr.w whenever
// sc.w reports failure (non-zero status).
amoadd:
	lr.w a4, (a2)
	add a5, a4, a3
	sc.w a5, a5, (a2)
	bnez a5, amoadd
	j amo_done

// Emulate AMOAND.W: store (old & a3) to the word at address a2.
// a4 receives the old memory value (returned for rd); a5 is scratch and
// receives the sc.w status. The sequence is retried from the lr.w whenever
// sc.w reports failure (non-zero status).
amoand:
	lr.w a4, (a2)
	and a5, a4, a3
	sc.w a5, a5, (a2)
	bnez a5, amoand
	j amo_done

// Emulate AMOOR.W: store (old | a3) to the word at address a2.
// a4 receives the old memory value (returned for rd); a5 is scratch and
// receives the sc.w status. The sequence is retried from the lr.w whenever
// sc.w reports failure (non-zero status).
amoor:
	lr.w a4, (a2)
	or a5, a4, a3
	sc.w a5, a5, (a2)
	bnez a5, amoor
	j amo_done

// Emulate AMOXOR.W: store (old ^ a3) to the word at address a2.
// a4 receives the old memory value (returned for rd); a5 is scratch and
// receives the sc.w status. The sequence is retried from the lr.w whenever
// sc.w reports failure (non-zero status).
amoxor:
	lr.w a4, (a2)
	xor a5, a4, a3
	sc.w a5, a5, (a2)
	bnez a5, amoxor
	j amo_done

// Emulate AMOMIN/AMOMAX/AMOMINU/AMOMAXU.
//
// Swap based on expected value combination of slt, sltu for current and new
// value. The sequence between lr.w and sc.w is kept branch-free and uses
// only base-ISA integer instructions (no mul), in line with the ISA rules
// for LR/SC sequences that are guaranteed to make forward progress. This
// also means the routine does not depend on the M extension.
//
// In:  a2 = address, a3 = rs2 value, a7 = selector:
//        a7[1:0] are expected values for {sltu, slt}
//        a7[3:2] are care-mask for {sltu, slt}
// Out: a4 = old memory value (returned for rd)
// Clobbers a1 (sc.w status), a5, a6, t0, t1.
amo_minmax:
	lr.w a4, (a2)
	// t0[1:0] = {sltu, slt} of (rs2 < old)
	slt t0, a3, a4
	sltu t1, a3, a4
	slli t1, t1, 1
	or t0, t0, t1
	// Mismatch bits against the expected values, kept only where cared about
	xor t0, t0, a7
	srli t1, a7, 2
	and t0, t0, t1
	// a5 = 1 if every cared-about comparison matched, else 0
	seqz a5, t0
	// set a5 to a3 if comparison true, else a4:
	// widen the 0/1 flag to 0/all-ones, use it to gate (a3 ^ a4), then xor
	// with a4 to obtain either a4 (gate closed) or a3 (gate open).
	xor a6, a3, a4
	neg a5, a5
	and a5, a5, a6
	xor a5, a5, a4
	sc.w a1, a5, (a2)
	bnez a1, amo_minmax
	j amo_done

// Emulate AMOSWAP.W: store a3 to the word at address a2.
// a4 receives the old memory value (returned for rd); a5 receives the sc.w
// status, and the sequence is retried while that status is non-zero.
amoswap:
	lr.w a4, (a2)
	sc.w a5, a3, (a2)
	bnez a5, amoswap
	// fall-thru to amo_done (it must directly follow this routine)

// Common tail of every AMO routine: deliver the result to rd and return.
//
// In: a0 = instruction word, a4 = value for rd, sp = register frame base.
// The value is written into rd's slot of the spilled frame, so it becomes
// the live register value when the exit path reloads the frame. Slot 0 is
// never reloaded by that path, so a write for rd = x0 has no visible effect.
amo_done:
	// Move the rd field (bit 7 upward) down to bit 2: shift by 7 - 2 = 5.
	srli a3, a0, 5
	// Keep the five field bits, now scaled by 4: 0x1f << 2 = 0x7c.
	andi a3, a3, 0x7c
	add a3, sp, a3
	sw a4, 0(a3)
	j restore_integer_regs_then_mret
	// NOTE(review): from here on the file repeats, tab-indented, the
	// definitions and labels that already appear earlier in this file.
	// Re-defining a macro with the same value is harmless, but the labels
	// below would be defined twice. Confirm whether this second copy is
	// intentional.
	//
	// Opcode value that handle_illegal_instr compares against the low bits
	// of the trapping instruction to select the AMO emulation path.
	#define OPCODE_AMO 0x2f

	// funct5 values, i.e. instruction bits 31:27 as extracted by handle_amo
	// (srli by 27), selecting which atomic operation to emulate.
	#define FUNCT5_AMOSWAP 0x01
	#define FUNCT5_AMOADD 0x00
	#define FUNCT5_AMOXOR 0x04
	#define FUNCT5_AMOAND 0x0c
	#define FUNCT5_AMOOR 0x08
	#define FUNCT5_AMOMIN 0x10
	#define FUNCT5_AMOMAX 0x14
	#define FUNCT5_AMOMINU 0x18
	#define FUNCT5_AMOMAXU 0x1c

	// Illegal-instruction trap entry point.
	//
	// Builds a 128-byte frame on the interrupted stack in which register xN
	// is stored at offset 4*N from the new sp, fetches the trapping
	// instruction into a0, and branches to handle_amo when its 7-bit major
	// opcode equals OPCODE_AMO. Any other instruction falls through to
	// bad_instr, which steps mepc past it and returns.
	.global handle_illegal_instr
	handle_illegal_instr:
	// Spill all registers to stack, so we can index them by register number.
	// The pre-trap sp is written first: -120 + 128 == 8, so it lands in
	// slot 2 of the frame once sp has been moved down.
	sw sp, -120(sp)
	addi sp, sp, -128
	// Slot 0 holds zero so that an rs1/rs2 field of x0 reads back as 0.
	sw x0 , 0  (sp)
	sw x1 , 4  (sp)
	// skip sp (already stored above)
	sw x3 , 12 (sp)
	sw x4 , 16 (sp)
	sw x5 , 20 (sp)
	sw x6 , 24 (sp)
	sw x7 , 28 (sp)
	sw x8 , 32 (sp)
	sw x9 , 36 (sp)
	sw x10, 40 (sp)
	sw x11, 44 (sp)
	sw x12, 48 (sp)
	sw x13, 52 (sp)
	sw x14, 56 (sp)
	sw x15, 60 (sp)
	sw x16, 64 (sp)
	sw x17, 68 (sp)
	sw x18, 72 (sp)
	sw x19, 76 (sp)
	sw x20, 80 (sp)
	sw x21, 84 (sp)
	sw x22, 88 (sp)
	sw x23, 92 (sp)
	sw x24, 96 (sp)
	sw x25, 100(sp)
	sw x26, 104(sp)
	sw x27, 108(sp)
	sw x28, 112(sp)
	sw x29, 116(sp)
	sw x30, 120(sp)
	sw x31, 124(sp)

	// Get instruction in a0 without performing any unaligned accesses (LSB of
	// mepc is always 0): read it as two halfwords and merge them, low half in
	// bits 15:0 and high half in bits 31:16.
	csrr a1, mepc
	lhu a0, (a1)
	lhu a2, 2(a1)
	slli a2, a2, 16
	or a0, a0, a2
	// The major opcode is the 7 LSBs; check whether it is one we can emulate.
	andi a1, a0, 0x7f

	li a4, OPCODE_AMO
	beq a1, a4, handle_amo
	// All matches fell through, we don't know how to handle this. Just NOP...

	// Common exit path, shared by successfully emulated instructions and by
	// bad_instr (unrecognised instructions, which are simply skipped).
	//
	// mepc is advanced by 4 so that mret resumes at the instruction after the
	// trapping one. Every integer register is then reloaded from the frame
	// built on entry; an emulated instruction delivers its result by having
	// overwritten the destination register's slot in that frame.
	restore_integer_regs_then_mret:
	bad_instr:
	// a0 is usable as scratch here because its saved value is reloaded below.
	csrr a0, mepc
	addi a0, a0, 4
	csrw mepc, a0
	// Slot N (byte offset 4*N) holds xN. Slot 0 (x0) is never reloaded.
	lw ra,    4(sp)		// x1
	lw gp,   12(sp)		// x3
	lw tp,   16(sp)		// x4
	lw t0,   20(sp)		// x5
	lw t1,   24(sp)		// x6
	lw t2,   28(sp)		// x7
	lw s0,   32(sp)		// x8
	lw s1,   36(sp)		// x9
	lw a0,   40(sp)		// x10
	lw a1,   44(sp)		// x11
	lw a2,   48(sp)		// x12
	lw a3,   52(sp)		// x13
	lw a4,   56(sp)		// x14
	lw a5,   60(sp)		// x15
	lw a6,   64(sp)		// x16
	lw a7,   68(sp)		// x17
	lw s2,   72(sp)		// x18
	lw s3,   76(sp)		// x19
	lw s4,   80(sp)		// x20
	lw s5,   84(sp)		// x21
	lw s6,   88(sp)		// x22
	lw s7,   92(sp)		// x23
	lw s8,   96(sp)		// x24
	lw s9,  100(sp)		// x25
	lw s10, 104(sp)		// x26
	lw s11, 108(sp)		// x27
	lw t3,  112(sp)		// x28
	lw t4,  116(sp)		// x29
	lw t5,  120(sp)		// x30
	lw t6,  124(sp)		// x31
	// sp (x2) is reloaded last because it is the base of every load above.
	lw sp,    8(sp)
	mret

	// Decode the AMO instruction held in a0 and dispatch to its emulation
	// routine.
	//
	// In:  a0 = trapping instruction word
	//      sp = base of the spilled register frame (xN at offset 4*N)
	// Out: a2 = value of rs1 (the memory address operand)
	//      a3 = value of rs2
	//      a7 = comparison selector, consumed only by amo_minmax:
	//             a7[1:0] = expected values for {sltu, slt}
	//             a7[3:2] = care-mask for {sltu, slt}
	// Only funct5 (bits 31:27) is examined; the bits below it are shifted out.
	// Branches to bad_instr when funct5 matches none of the FUNCT5_* values.
	handle_amo:
	// Get rs1, rs2 in a2, a3. Shifting right by (field position - 2) and
	// masking with 0x1f << 2 yields the register number already multiplied
	// by 4, which is that register's byte offset within the frame.
	srli a2, a0, 15 - 2
	andi a2, a2, 0x1f << 2
	add a2, a2, sp
	lw a2, (a2)
	srli a3, a0, 20 - 2
	andi a3, a3, 0x1f << 2
	add a3, a3, sp
	lw a3, (a3)

	// Decode correct AMO routine or fall through if no match
	srli a1, a0, 27

	// Each addi rebases a1 from (funct5 - previous candidate) to
	// (funct5 - next candidate), so a1 is zero exactly on a match.
	addi a1, a1, 0 - FUNCT5_AMOSWAP
	beqz a1, amoswap
	addi a1, a1, FUNCT5_AMOSWAP - FUNCT5_AMOADD
	beqz a1, amoadd
	addi a1, a1, FUNCT5_AMOADD - FUNCT5_AMOXOR
	beqz a1, amoxor
	addi a1, a1, FUNCT5_AMOXOR - FUNCT5_AMOAND
	beqz a1, amoand
	addi a1, a1, FUNCT5_AMOAND - FUNCT5_AMOOR
	beqz a1, amoor
	addi a1, a1, FUNCT5_AMOOR - FUNCT5_AMOMIN
	li a7, 0b0101 // care slt,  expect slt  == 1: store rs2 if rs2 <  mem (signed)
	beqz a1, amo_minmax
	addi a1, a1, FUNCT5_AMOMIN - FUNCT5_AMOMAX
	li a7, 0b0100 // care slt,  expect slt  == 0: store rs2 if rs2 >= mem (signed)
	beqz a1, amo_minmax
	addi a1, a1, FUNCT5_AMOMAX - FUNCT5_AMOMINU
	// The expected sltu value lives in bit 1 (bit 0 is slt and is masked
	// out here), so "sltu true" is 0b1010.
	li a7, 0b1010 // care sltu, expect sltu == 1: store rs2 if rs2 <  mem (unsigned)
	beqz a1, amo_minmax
	addi a1, a1, FUNCT5_AMOMINU - FUNCT5_AMOMAXU
	li a7, 0b1000 // care sltu, expect sltu == 0: store rs2 if rs2 >= mem (unsigned)
	beqz a1, amo_minmax
	j bad_instr


	// AMO routines.
	// Data for rs1, rs2 are passed in a2, a3.
	// Data for rd is returned in a4.

	// Emulate AMOADD.W: store (old + a3) to the word at address a2.
	// a4 receives the old memory value (returned for rd); a5 is scratch and
	// receives the sc.w status. The sequence is retried from the lr.w
	// whenever sc.w reports failure (non-zero status).
	amoadd:
	lr.w a4, (a2)
	add a5, a4, a3
	sc.w a5, a5, (a2)
	bnez a5, amoadd
	j amo_done

	// Emulate AMOAND.W: store (old & a3) to the word at address a2.
	// a4 receives the old memory value (returned for rd); a5 is scratch and
	// receives the sc.w status. The sequence is retried from the lr.w
	// whenever sc.w reports failure (non-zero status).
	amoand:
	lr.w a4, (a2)
	and a5, a4, a3
	sc.w a5, a5, (a2)
	bnez a5, amoand
	j amo_done

	// Emulate AMOOR.W: store (old | a3) to the word at address a2.
	// a4 receives the old memory value (returned for rd); a5 is scratch and
	// receives the sc.w status. The sequence is retried from the lr.w
	// whenever sc.w reports failure (non-zero status).
	amoor:
	lr.w a4, (a2)
	or a5, a4, a3
	sc.w a5, a5, (a2)
	bnez a5, amoor
	j amo_done

	// Emulate AMOXOR.W: store (old ^ a3) to the word at address a2.
	// a4 receives the old memory value (returned for rd); a5 is scratch and
	// receives the sc.w status. The sequence is retried from the lr.w
	// whenever sc.w reports failure (non-zero status).
	amoxor:
	lr.w a4, (a2)
	xor a5, a4, a3
	sc.w a5, a5, (a2)
	bnez a5, amoxor
	j amo_done

	// Emulate AMOMIN/AMOMAX/AMOMINU/AMOMAXU.
	//
	// Swap based on expected value combination of slt, sltu for current and
	// new value. The sequence between lr.w and sc.w is kept branch-free and
	// uses only base-ISA integer instructions (no mul), in line with the ISA
	// rules for LR/SC sequences that are guaranteed to make forward progress.
	// This also means the routine does not depend on the M extension.
	//
	// In:  a2 = address, a3 = rs2 value, a7 = selector:
	//        a7[1:0] are expected values for {sltu, slt}
	//        a7[3:2] are care-mask for {sltu, slt}
	// Out: a4 = old memory value (returned for rd)
	// Clobbers a1 (sc.w status), a5, a6, t0, t1.
	amo_minmax:
	lr.w a4, (a2)
	// t0[1:0] = {sltu, slt} of (rs2 < old)
	slt t0, a3, a4
	sltu t1, a3, a4
	slli t1, t1, 1
	or t0, t0, t1
	// Mismatch bits against the expected values, kept only where cared about
	xor t0, t0, a7
	srli t1, a7, 2
	and t0, t0, t1
	// a5 = 1 if every cared-about comparison matched, else 0
	seqz a5, t0
	// set a5 to a3 if comparison true, else a4:
	// widen the 0/1 flag to 0/all-ones, use it to gate (a3 ^ a4), then xor
	// with a4 to obtain either a4 (gate closed) or a3 (gate open).
	xor a6, a3, a4
	neg a5, a5
	and a5, a5, a6
	xor a5, a5, a4
	sc.w a1, a5, (a2)
	bnez a1, amo_minmax
	j amo_done

	// Emulate AMOSWAP.W: store a3 to the word at address a2.
	// a4 receives the old memory value (returned for rd); a5 receives the
	// sc.w status, and the sequence is retried while that status is non-zero.
	amoswap:
	lr.w a4, (a2)
	sc.w a5, a3, (a2)
	bnez a5, amoswap
	// fall-thru to amo_done (it must directly follow this routine)

	// Common tail of every AMO routine: deliver the result to rd and return.
	//
	// In: a0 = instruction word, a4 = value for rd, sp = register frame base.
	// The value is written into rd's slot of the spilled frame, so it becomes
	// the live register value when the exit path reloads the frame. Slot 0 is
	// never reloaded by that path, so a write for rd = x0 has no visible
	// effect.
	amo_done:
	// Move the rd field (bit 7 upward) down to bit 2: shift by 7 - 2 = 5.
	srli a3, a0, 5
	// Keep the five field bits, now scaled by 4: 0x1f << 2 = 0x7c.
	andi a3, a3, 0x7c
	add a3, sp, a3
	sw a4, 0(a3)
	j restore_integer_regs_then_mret