@hcs64
Created May 31, 2020 13:35
4bpp and 8bpp conversions
// RSP ucode for converting 2bpp NES to 4bpp and 8bpp N64
// TODO sprite priority
align(8)
scope Ucode: {
InitialDMEM:
pushvar base
base 0x0000
// Unfortunately it isn't possible to use constants for vector register names(?)
// v31
Zeroes:
fill 8*2,0
// v10
BitsOfBytes:
dh 0x0101, 0x0202, 0x0404, 0x0808
dh 0x1010, 0x2020, 0x4040, 0x8080
macro select_bits(src) {
// select bits
vand v7,{src},v10[e15]
vand v6,{src},v10[e14]
vand v5,{src},v10[e13]
vand v4,{src},v10[e12]
vand v3,{src},v10[e11]
vand v2,{src},v10[e10]
vand v1,{src},v10[e9]
vand v0,{src},v10[e8]
}
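// Worked example (illustrative values): if a row's element is 0xC35A
// (one bitplane per byte), then v10[e15] = 0x8080 picks out bit 7 of
// both planes (v7 = 0x8000) and v10[e8] = 0x0101 picks out bit 0
// (v0 = 0x0100), so each of v0..v7 isolates one pixel column of the row.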
// 4bpp
// Pseudo shifts (and adds) to put bits in 12,11 and 8,7 (for sfv)
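// The constants below encode pseudo-shifts (see the bg_loop comments):
// multiplying by 1<<k with vm?n keeps the low 16 bits of the product,
// i.e. a left shift by k, while multiplying by 1<<(16-k) with vm?l
// keeps the product shifted down by 16, i.e. a right shift by k.
// e.g. 0x8000 * (1<<12) = 0x0800'0000, >>16 = 0x0800 = 0x8000 >> 4.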
// v8
ShiftMux0:
dh 1<<(16+( 7- 8)) // -16
//dh 1<< (11- 9) // unused, combined with mux1
dh 0b0100'0100 // used to promote attributes (4bpp)
dh 1<<(16+( 7-10)) // -16
dh 1<<(16-(8-2)) // used to promote attributes (8bpp)
//dh 1<< (11-11) // unused, combined with mux1
dh 1<<(16+( 7-12)) // -16
dh 1<<(16+(11-13)) // -16
dh 1<<(16+( 7-14)) // -16
dh 1<<(16+(11-15)) // -16
// v9
ShiftMux1:
dh 1<<( 8- 0)
dh (1<<(12- 1))|(1<<(11-9))
dh 1<<( 8- 2)
dh (1<<(12- 3))|(1<<(11-11))
dh 1<<( 8- 4)
dh 1<<(12- 5)
dh 1<<( 8- 6)
dh 1<<(12- 7)
// 8bpp
// Pseudo shifts to put bits in 9,8 and 1,0 (for sdv)
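// Each entry below is annotated with the isolated input bit pair for
// that column, the partial shifts produced by the multiplies, and their
// sum in the accumulator; the second (left) shift, where one is needed,
// comes from ShiftMuxSp1.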
// v29
ShiftMuxSp0:
// ....'...A ....'...B
//
// ....'.... ....'...A
// + ....'..A. ....'..B.
// = ....'..A. ....'..BA
dh 0x10000>>-( 0- 8) // >>8 | <<1
// ....'..A. ....'..B.
//
// ....'...A ....'...B
// + ....'..B. ....'....
// = ....'..BA ....'...B
dh 0x10000>>-( 8- 9) // >>1 | <<8
// ....'.A.. ....'.B..
//
// ....'.... ....'...A
// + ....'..A. ....'..B.
// = ....'..A. ....'..BA
dh (0x10000>>-( 0-10))|(0x10000>>-( 1- 2)) // >>10 | >> 1
// ....'A... ....'B...
//
// ....'...A ....'...B
// + ....'..B. ....'....
// = ....'..BA ....'...B
dh 0x10000>>-( 8-11) // >>3 | << 6
// ...A'.... ...B'....
//
// ....'.... ....'...A
// + ....'..A. ....'..B.
// = ....'..A. ....'..BA
dh (0x10000>>-( 0-12))|(0x10000>>-( 1- 4)) // >>12 | >> 3
// ..A.'.... ..B.'....
//
// ....'...A ....'...B
// + ....'..B. ....'....
// = ....'..BA ....'...B
dh 0x10000>>-( 8-13) // >>5 | << 4
// .A..'.... .B..'....
//
// ....'.... ....'...A
// + ....'..A. ....'..B.
// = ....'..A. ....'..BA
dh (0x10000>>-( 0-14))|(0x10000>>-( 1- 6)) // >>14 | >> 5
// A...'.... B...'....
//
// ....'...A ....'...B
// + ....'..B. ....'....
// = ....'..BA ....'...B
dh 0x10000>>-( 8-15) // >>7 | << 2
// v30
ShiftMuxSp1:
dh 1<< ( 1- 0) // left
dh 1<< ( 9- 1) // left
dh 0x10000>>-( 1- 2) // right (unused)
dh 1<< ( 9- 3) // left
dh 0x10000>>-( 1- 4) // right (unused)
dh 1<< ( 9- 5) // left
dh 0x10000>>-( 1- 6) // right (unused)
dh 1<< ( 9- 7) // left
// v28
Masks:
dh 0x0300, 0x0003, 0, 0 // keep pixel bits 9,8 / bits 1,0 (8bpp path)
dh 0,0,0,0
pullvar base
// IMEM
align(8)
IMEM:
pushvar base
base 0x0000
constant dmem_src(0)
constant dmem_dst(conv_src_size)
arch n64.rsp
Boot:
-
mfc0 t0, C0_DMA_FULL
bnez t0,-
nop
mtc0 r0, C0_MEM_ADDR
la a0, Ucode.InitialDMEM
mtc0 a0, C0_DRAM_ADDR
lli t0, 0x1000-1
mtc0 t0, C0_RD_LEN
-
mfc0 t0, C0_DMA_BUSY
bnez t0,-
nop
lqv v8[e0],ShiftMux0(r0)
lqv v9[e0],ShiftMux1(r0)
lqv v10[e0],BitsOfBytes(r0)
lqv v28[e0],Masks(r0)
lqv v29[e0],ShiftMuxSp0(r0)
lqv v30[e0],ShiftMuxSp1(r0)
lqv v31[e0],Zeroes(r0)
ResetFrame:
la a0, conv_src_buffer & 0x7f'ffff
la a1, conv_dst_buffer & 0x7f'ffff
break
SkipLines:
addi a0, conv_src_size
addi a1, conv_dst_size
break
ConvertLines:
-
mfc0 t0, C0_DMA_FULL
bnez t0,-
nop
lli t0, dmem_src
mtc0 t0, C0_MEM_ADDR
mtc0 a0, C0_DRAM_ADDR
lli t0, conv_src_size-1
mtc0 t0, C0_RD_LEN
-
mfc0 t0, C0_DMA_BUSY
bnez t0,-
nop
lli a2, dmem_src + src_bg_pat
lli a3, dmem_dst + dst_bg
lli t8, dmem_src + src_bg_atr
lli t0, src_sp_pat - src_bg_pat
lqv v11[e0], 0(a2) // v11 = Tile BitPlane 0,1 Row 0..7
bg_loop:
// We're doing the same operation for each of 8 8-pixel rows in v11.
// An element holds the high and low bitplane for one row.
// select bits
select_bits(v11)
// Prefetch the next tile
lqv v11[e0], 16(a2) // v11 = Tile BitPlane 0,1 Row 0..7
addiu a2, 16
// Column 7,6
// The elements of v0-v7 now contain each bit 0-7 of both bitplanes.
// For columns 7 and 6, we want to go from
// AB.. .... CD.. ....
// to packed within 14-7
// ...C A..D B... ....
// in order to be in place for sfv to write that as
// ..C A..D B
// This involves shifting A and B right by different amounts, and
// C and D left by different amounts, and finally combining them all.
//
// Since we have each bit of a byte in its own reg
// v7 = A... .... C... ....
// v6 = .B.. .... .D.. ....
// we want to do these shifts:
// (11-15) = >> 4 = .... A... .... C...
// (12- 7) = << 5 = ...C .... .... ....
// ( 7-14) = >> 7 = .... .... B... ....
// ( 8- 6) = << 2 = .... ...D .... ....
// Left shifts use vm?n, a multiply, which doesn't clamp until bit 31.
// Right shifts use vm?l, which shifts the multiply result down by 16.
// Occasionally (see cols 1 & 3) we can do two shifts together if both
// bits need to be shifted in the same direction.
// Since there is only one bit in each column (they were 8 bits apart
// so only one of each pair ends up in the 14-7 window) we can combine
// with the accumulator using the vma? ops.
// We don't need to worry about the bits outside of 14-7, as long
// as they don't carry into bit 7, which doesn't happen here.
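// Worked example for column 7 with both planes set (v7 = 0x8080):
// vmudl by v8[e15] = 1<<12 gives 0x8080 >> 4 = 0x0808 (A at bit 11,
// plus a stray bit 3), and vmadn by v9[e15] = 1<<5 adds the low half
// of 0x8080 << 5, i.e. 0x1000 (C at bit 12), leaving ...C A... inside
// the 14-7 window.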
vmudl v12,v7,v8[e15]
vmadn v12,v7,v9[e15]
vmadl v12,v6,v8[e14]
vmadn v12,v6,v9[e14]
// Column 5,4
vmudl v13,v5,v8[e13]
vmadn v13,v5,v9[e13]
vmadl v13,v4,v8[e12]
vmadn v13,v4,v9[e12]
// Load attribute bits
luv v16[e0], 0(t8)
// Column 3,2
vmudn v14,v3,v9[e11]
vmadl v14,v2,v8[e10]
vmadn v14,v2,v9[e10]
// Column 1,0
vmudn v15,v1,v9[e9]
vmadl v15,v0,v8[e8]
vmadn v15,v0,v9[e8]
// Add attribute bits to pixels
// These are in bytes as
// .... ..AB
// luv loads them as
// .... ...A B... ....
// and we want them at
// .AB. .AB. .... ....
// so they can end up, after sfv, as
// AB.. AB..
// this is done by multiplying with 0b0100'0100,
// effectively (x<<2)|(x<<6), which is stashed in
// an otherwise unused element of ShiftMux0.
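// Worked example with both attribute bits set (byte 0b0000'0011):
// luv gives 0x0180; 0x0180 * 0b0100'0100 = 0x6600, i.e. .AB..AB. in
// bits 14,13,10,9, which the 14-7 window reads back as AB..AB..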
vmudn v16,v16,v8[e9]
// Each element of v16 holds doubled attributes for
// each row, combine with each pair of pixels.
vor v12,v12,v16[e0]
vor v13,v13,v16[e0]
vor v14,v14,v16[e0]
vor v15,v15,v16[e0]
//define HAS_SFV()
if {defined HAS_SFV} {
// Store Columns 7,6
sfv v12[e0],0(a3)
sfv v12[e8],16(a3)
addi a3, 1
// Store Columns 5,4
sfv v13[e0],0(a3)
sfv v13[e8],16(a3)
addi a3, 1
// Store Columns 3,2
sfv v14[e0],0(a3)
sfv v14[e8],16(a3)
addi a3, 1
// Store Columns 1,0
sfv v15[e0],0(a3)
sfv v15[e8],16(a3)
addi a3, 1+(32-4)
} else {
macro sfv_sim(e) {
evaluate rep_i(0)
mfc2 t1,v12[{e}]
srl t2, t1, 7
sb t2, 0(a3)
addiu a3, 1
mfc2 t1,v13[{e}]
srl t2, t1, 7
sb t2, 0(a3)
addiu a3, 1
mfc2 t1,v14[{e}]
srl t2, t1, 7
sb t2, 0(a3)
addiu a3, 1
mfc2 t1,v15[{e}]
srl t2, t1, 7
sb t2, 0(a3)
addiu a3, 1
evaluate rep_i({rep_i}+1)
}
sfv_sim(e0)
sfv_sim(e2)
sfv_sim(e4)
sfv_sim(e6)
sfv_sim(e8)
sfv_sim(e10)
sfv_sim(e12)
sfv_sim(e14)
}
addi t0, -16
bnez t0, bg_loop
addi t8, 8
// Adjust for fine X scroll
scope FineXBG {
constant lines_left(sp_s0)
constant left_shift(sp_s1)
constant right_shift(sp_s2)
constant leftover(sp_s3)
constant tiles_left(sp_s4)
constant tile32(sp_s5)
// Working backwards in each line, shift each tile left, combine with the
// part that was shifted out of the previous tile.
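// A 4bpp pixel is 4 bits and a tile row is one 32-bit word (8 pixels),
// so a fine X of n pixels becomes a shift of n*4 bits (the sll by 2
// below); e.g. n = 2 gives shifted = (word << 8) | leftover, with the
// word's top 8 bits becoming the leftover for the next word processed.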
lli a2, dmem_dst+dst_bg+(32*8-1)*4
lli tile32, dmem_dst+dst_bg+(33*8-1)*4
lli lines_left, 8-1
shift_line_loop:
lbu left_shift, dmem_src+src_bg_x (lines_left)
lw leftover, 0 (tile32)
addi tile32, -4
lli tiles_left, 32
bnez left_shift,+
sll left_shift, 2
// srlv can only do up to 31 bits, so we can't do this for X=0 (it would be useless anyway)
j shift_tile_loop_end
addi a2, -32*4
+
// overloading tiles_left's 32 as 32 bits here
sub right_shift, tiles_left, left_shift
srlv leftover, right_shift
shift_tile_loop:
// SU loads take 3 cycles, so unroll these loops 4x
evaluate rep_i(0)
while {rep_i} < 4 {
evaluate src(t0 + {rep_i})
lw {src}, -{rep_i} * 4 (a2)
evaluate rep_i({rep_i} + 1)
}
evaluate rep_i(0)
while {rep_i} < 4 {
evaluate src(t0 + {rep_i})
evaluate shifted(t4 + {rep_i})
sllv {shifted}, {src}, left_shift
or {shifted}, leftover
srlv leftover, {src}, right_shift
sw {shifted}, -{rep_i} * 4 (a2)
evaluate rep_i({rep_i} + 1)
}
addi tiles_left, -4
bnez tiles_left, shift_tile_loop
addi a2, -4*4
shift_tile_loop_end:
bnez lines_left, shift_line_loop
addi lines_left, -1
} // end scope FineXBG
// DMA out BG
lli t0, dmem_dst
mtc0 t0, C0_MEM_ADDR
mtc0 a1, C0_DRAM_ADDR
lli t0, dst_sp-dst_bg-1
mtc0 t0, C0_WR_LEN
addi a1, dst_sp-dst_bg
// There's only one port to DMEM, so it's a good idea to wait for this DMA to
// finish before proceeding to the sprites.
-
mfc0 t0, C0_DMA_BUSY
bnez t0,-
nop
// ##### Sprites
// Zero out the lines
lli t0, dmem_dst+0x80
lli t1, 256*8
-
sqv v31[e0],0(t0)
addi t1, -16
bnez t1,-
addi t0, 16
// Convert to 8bpp
lli a2, dmem_src+src_sp_pat
lli a3, dmem_src+src_sp_atr
lli sp_s1, 8
lli sp_s2, dmem_src+src_sp_x
lli sp_s3, dmem_dst+0x80
lqv v11[e0], 0(a2)
sprite_loop:
macro sprite_convert(_76,_54,_32,_10) {
// Column 7,6
vmudl v13,v7,v29[e15]
vmadn v13,v7,v30[e15]
vand v13,v13,v28[e8]
vmudl v14,v6,v29[e14]
vand v14,v14,v28[e9]
vor {_76},v13,v14[e0]
// Column 5,4
vmudl v13,v5,v29[e13]
vmadn v13,v5,v30[e13]
vand v13,v13,v28[e8]
vmudl v14,v4,v29[e12]
vand v14,v14,v28[e9]
vor {_54},v13,v14[e0]
// Column 3,2
vmudl v13,v3,v29[e11]
vmadn v13,v3,v30[e11]
vand v13,v13,v28[e8]
vmudl v14,v2,v29[e10]
vand v14,v14,v28[e9]
vor {_32},v13,v14[e0]
// Column 1,0
vmudl v13,v1,v29[e9]
vmadn v13,v1,v30[e9]
vand v13,v13,v28[e8]
vmudl v14,v0,v29[e8]
vmadn v14,v0,v30[e8]
vand v14,v14,v28[e9]
vor {_10},v13,v14[e0]
// Add attributes
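// Roughly: lpv puts the attribute byte (.... ..AB, as in the BG path)
// into bits 9,8 of each element; vmudl by v8[e11] (1<<10, a >>6) moves
// it to bits 3,2, vmudn by 0x0101 copies it into both bytes (bits 11,10
// and 3,2), and OR-ing 0x1010 sets the sprite palette bit, so each
// output byte reads 0001'ABcc once the pixel bits cc are OR'd in below.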
vmudl v24,v24,v8[e11]
vmudn v24,v24,v10[e8]
vor v24,v24,v10[e12] // 0x1010, sprite palette
vor {_76},{_76},v24[e0]
vor {_54},{_54},v24[e0]
vor {_32},{_32},v24[e0]
vor {_10},{_10},v24[e0]
}
// First line (with 2 lines we can do a full transpose)
lpv v24[e0], 0(a3) // attributes
select_bits(v11)
// Preload next line
lqv v11[e0], 16(a2)
sprite_convert(v16,v17,v18,v19)
// Second line
lpv v24[e0], 8(a3) // attributes
select_bits(v11)
// Preload next line
lqv v11[e0], 32(a2)
sprite_convert(v20,v21,v22,v23)
// So now we have
// v16: pixel 0,1 of 8 sprites on line 0
// v17: pixel 2,3
// v18: pixel 4,5
// v19: pixel 6,7
// v20: pixel 0,1 of 8 sprites on line 1
// v21: pixel 2,3, line 1
// v22: pixel 4,5, line 1
// v23: pixel 6,7, line 1
// (0.0,0.1),...
// (0.2,0.3),...
// (0.4,0.5),...
// (0.6,0.7),...
// (8.0,8.1),...
// (8.2,8.3),...
// (8.4,8.5),...
// (8.6,8.7),...
// The idea is to transpose this to
// (0.0,0.1),(0.2,0.3),(0.4,0.5),(0.6,0.7),(8.0,8.1),(8.2,8.3),(8.4,8.5),(8.6,8.7)
// ...
// v16: line 0 sprite 0, line 1 sprite 0
// v17: line 0 sprite 1, line 1 sprite 1
// v18: line 0 sprite 2, line 1 sprite 2
// v19: line 0 sprite 3, line 1 sprite 3
// v20: line 0 sprite 4, line 1 sprite 4
// v21: line 0 sprite 5, line 1 sprite 5
// v22: line 0 sprite 6, line 1 sprite 6
// v23: line 0 sprite 7, line 1 sprite 7
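// stv writes one element from each of v16..v23 into a 16-byte slice,
// and ltv reads a slice back into one element of each of v16..v23, so
// the seven store/load pairs below rotate the 8x8 block of halfwords
// into the transposed layout above (see the element mapping noted below).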
lli t0, dmem_dst
// Note: I'm not 100% sure that this is the layout in DMEM.
// v17[0],v18[1],v19[2],v20[3],v21[4],v22[5],v23[6],v16[7] -> 0x70
stv v16[e2], 0x70(t0)
stv v16[e4], 0x60(t0)
stv v16[e6], 0x50(t0)
stv v16[e8], 0x40(t0)
stv v16[e10], 0x30(t0)
stv v16[e12], 0x20(t0)
stv v16[e14], 0x10(t0)
// 0x70 -> v16[1],v17[2],v18[3],v19[4],v20[5],v21[6],v22[7],v23[0]
ltv v16[e14], 0x70(t0)
ltv v16[e12], 0x60(t0)
ltv v16[e10], 0x50(t0)
ltv v16[e8], 0x40(t0)
ltv v16[e6], 0x30(t0)
ltv v16[e4], 0x20(t0)
ltv v16[e2], 0x10(t0)
// Fill in each sprite
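// Each sdv writes one sprite's 8-byte line into the 256-byte (8bpp)
// line buffer at that sprite's X coordinate, read from src_sp_x.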
lbu t3, 7(sp_s2)
lbu t2, 6(sp_s2)
lbu t1, 5(sp_s2)
lbu t0, 4(sp_s2)
add t3, sp_s3
sdv v23[e0], 0(t3)
add t2, sp_s3
sdv v22[e0], 0(t2)
add t1, sp_s3
sdv v21[e0], 0(t1)
add t0, sp_s3
sdv v20[e0], 0(t0)
lbu t3, 3(sp_s2)
lbu t2, 2(sp_s2)
lbu t1, 1(sp_s2)
lbu t0, 0(sp_s2)
add t3, sp_s3
sdv v19[e0], 0(t3)
add t2, sp_s3
sdv v18[e0], 0(t2)
add t1, sp_s3
sdv v17[e0], 0(t1)
add t0, sp_s3
sdv v16[e0], 0(t0)
addi sp_s3, 256
addi sp_s2, 8
lbu t3, 7(sp_s2)
lbu t2, 6(sp_s2)
lbu t1, 5(sp_s2)
lbu t0, 4(sp_s2)
add t3, sp_s3
sdv v23[e8], 0(t3)
add t2, sp_s3
sdv v22[e8], 0(t2)
add t1, sp_s3
sdv v21[e8], 0(t1)
add t0, sp_s3
sdv v20[e8], 0(t0)
lbu t3, 3(sp_s2)
lbu t2, 2(sp_s2)
lbu t1, 1(sp_s2)
lbu t0, 0(sp_s2)
add t3, sp_s3
sdv v19[e8], 0(t3)
add t2, sp_s3
sdv v18[e8], 0(t2)
add t1, sp_s3
sdv v17[e8], 0(t1)
add t0, sp_s3
sdv v16[e8], 0(t0)
addi sp_s3, 256
addi sp_s2, 8
addi a2, 16*2
addi sp_s1, -2
bnez sp_s1,sprite_loop
addi a3, 8*2
// DMA out sprites
lli t0, dmem_dst+0x80
mtc0 t0, C0_MEM_ADDR
mtc0 a1, C0_DRAM_ADDR
lli t0, conv_dst_size-dst_sp-1
mtc0 t0, C0_WR_LEN
-
mfc0 t0, C0_DMA_BUSY
bnez t0,-
nop
addi a1, conv_dst_size-dst_sp
addi a0, conv_src_size
break
}
pullvar base
arch n64.cpu
align(4)