@hcs64
Created May 31, 2020 13:35
4bpp and 8bpp conversions
// RSP ucode for converting 2bpp NES to 4bpp and 8bpp N64
// TODO sprite priority
align(8)
scope Ucode: {
InitialDMEM:
pushvar base
base 0x0000
// Unfortunately it isn't possible to use constants for vector register names(?)
// v31
Zeroes:
fill 8*2,0
// v10
BitsOfBytes:
dh 0x0101, 0x0202, 0x0404, 0x0808
dh 0x1010, 0x2020, 0x4040, 0x8080
macro select_bits(src) {
// select bits
vand v7,{src},v10[e15]
vand v6,{src},v10[e14]
vand v5,{src},v10[e13]
vand v4,{src},v10[e12]
vand v3,{src},v10[e11]
vand v2,{src},v10[e10]
vand v1,{src},v10[e9]
vand v0,{src},v10[e8]
}
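// Worked example (illustrative values): if a row's element is 0xC35A
// (one bitplane per byte), then v10[e15] = 0x8080 picks out bit 7 of
// both planes (v7 = 0x8000) and v10[e8] = 0x0101 picks out bit 0
// (v0 = 0x0100), so each of v0..v7 isolates one pixel column of the row.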
// 4bpp
// Pseudo shifts (and adds) to put bits in 12,11 and 8,7 (for sfv)
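// The constants below encode pseudo-shifts (see the bg_loop comments):
// multiplying by 1<<k with vm?n keeps the low 16 bits of the product,
// i.e. a left shift by k, while multiplying by 1<<(16-k) with vm?l
// keeps the product shifted down by 16, i.e. a right shift by k.
// e.g. 0x8000 * (1<<12) = 0x0800'0000, >>16 = 0x0800 = 0x8000 >> 4.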
// v8
ShiftMux0:
dh 1<<(16+( 7- 8)) // -16
//dh 1<< (11- 9) // unused, combined with mux1
dh 0b0100'0100 // used to promote attributes (4bpp)
dh 1<<(16+( 7-10)) // -16
dh 1<<(16-(8-2)) // used to promote attributes (8bpp)
//dh 1<< (11-11) // unused, combined with mux1
dh 1<<(16+( 7-12)) // -16
dh 1<<(16+(11-13)) // -16
dh 1<<(16+( 7-14)) // -16
dh 1<<(16+(11-15)) // -16
// v9
ShiftMux1:
dh 1<<( 8- 0)
dh (1<<(12- 1))|(1<<(11-9))
dh 1<<( 8- 2)
dh (1<<(12- 3))|(1<<(11-11))
dh 1<<( 8- 4)
dh 1<<(12- 5)
dh 1<<( 8- 6)
dh 1<<(12- 7)
// 8bpp
// Pseudo shifts to put bits in 9,8 and 1,0 (for sdv)
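// Each entry below is annotated with the isolated input bit pair for
// that column, the partial shifts produced by the multiplies, and their
// sum in the accumulator; the second (left) shift, where one is needed,
// comes from ShiftMuxSp1.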
// v29
ShiftMuxSp0:
// ....'...A ....'...B
//
// ....'.... ....'...A
// + ....'..A. ....'..B.
// = ....'..A. ....'..BA
dh 0x10000>>-( 0- 8) // >>8 | <<1
// ....'..A. ....'..B.
//
// ....'...A ....'...B
// + ....'..B. ....'....
// = ....'..BA ....'...B
dh 0x10000>>-( 8- 9) // >>1 | <<8
// ....'.A.. ....'.B..
//
// ....'.... ....'...A
// + ....'..A. ....'..B.
// = ....'..A. ....'..BA
dh (0x10000>>-( 0-10))|(0x10000>>-( 1- 2)) // >>10 | >> 1
// ....'A... ....'B...
//
// ....'...A ....'...B
// + ....'..B. ....'....
// = ....'..BA ....'...B
dh 0x10000>>-( 8-11) // >>3 | << 6
// ...A'.... ...B'....
//
// ....'.... ....'...A
// + ....'..A. ....'..B.
// = ....'..A. ....'..BA
dh (0x10000>>-( 0-12))|(0x10000>>-( 1- 4)) // >>12 | >> 3
// ..A.'.... ..B.'....
//
// ....'...A ....'...B
// + ....'..B. ....'....
// = ....'..BA ....'...B
dh 0x10000>>-( 8-13) // >>5 | << 4
// .A..'.... .B..'....
//
// ....'.... ....'...A
// + ....'..A. ....'..B.
// = ....'..A. ....'..BA
dh (0x10000>>-( 0-14))|(0x10000>>-( 1- 6)) // >>14 | >> 5
// A...'.... B...'....
//
// ....'...A ....'...B
// + ....'..B. ....'....
// = ....'..BA ....'...B
dh 0x10000>>-( 8-15) // >>7 | << 2
// v30
ShiftMuxSp1:
dh 1<< ( 1- 0) // left
dh 1<< ( 9- 1) // left
dh 0x10000>>-( 1- 2) // right (unused)
dh 1<< ( 9- 3) // left
dh 0x10000>>-( 1- 4) // right (unused)
dh 1<< ( 9- 5) // left
dh 0x10000>>-( 1- 6) // right (unused)
dh 1<< ( 9- 7) // left
// v28
Masks:
dh 0x0300, 0x0003, 0, 0 // keep pixel bits 9,8 / bits 1,0 (8bpp path)
dh 0,0,0,0
pullvar base
// IMEM
align(8)
IMEM:
pushvar base
base 0x0000
constant dmem_src(0)
constant dmem_dst(conv_src_size)
arch n64.rsp
Boot:
-
mfc0 t0, C0_DMA_FULL
bnez t0,-
nop
mtc0 r0, C0_MEM_ADDR
la a0, Ucode.InitialDMEM
mtc0 a0, C0_DRAM_ADDR
lli t0, 0x1000-1
mtc0 t0, C0_RD_LEN
-
mfc0 t0, C0_DMA_BUSY
bnez t0,-
nop
lqv v8[e0],ShiftMux0(r0)
lqv v9[e0],ShiftMux1(r0)
lqv v10[e0],BitsOfBytes(r0)
lqv v28[e0],Masks(r0)
lqv v29[e0],ShiftMuxSp0(r0)
lqv v30[e0],ShiftMuxSp1(r0)
lqv v31[e0],Zeroes(r0)
ResetFrame:
la a0, conv_src_buffer & 0x7f'ffff
la a1, conv_dst_buffer & 0x7f'ffff
break
SkipLines:
addi a0, conv_src_size
addi a1, conv_dst_size
break
ConvertLines:
-
mfc0 t0, C0_DMA_FULL
bnez t0,-
nop
lli t0, dmem_src
mtc0 t0, C0_MEM_ADDR
mtc0 a0, C0_DRAM_ADDR
lli t0, conv_src_size-1
mtc0 t0, C0_RD_LEN
-
mfc0 t0, C0_DMA_BUSY
bnez t0,-
nop
lli a2, dmem_src + src_bg_pat
lli a3, dmem_dst + dst_bg
lli t8, dmem_src + src_bg_atr
lli t0, src_sp_pat - src_bg_pat
lqv v11[e0], 0(a2) // v11 = Tile BitPlane 0,1 Row 0..7
bg_loop:
// We're doing the same operation for each of 8 8-pixel rows in v11.
// An element holds the high and low bitplane for one row.
// select bits
select_bits(v11)
// Prefetch the next tile
lqv v11[e0], 16(a2) // v11 = Tile BitPlane 0,1 Row 0..7
addiu a2, 16
// Column 7,6
// The elements of v0-v7 now contain each bit 0-7 of both bitplanes.
// For columns 7 and 6, we want to go from
// AB.. .... CD.. ....
// to packed within 14-7
// ...C A..D B... ....
// in order to be in place for sfv to write that as
// ..C A..D B
// This involves shifting A and B right by different amounts, and
// C and D left by different amounts, and finally combining them all.
//
// Since we have each bit of a byte in its own reg
// v7 = A... .... C... ....
// v6 = .B.. .... .D.. ....
// we want to do these shifts:
// (11-15) = >> 4 = .... A... .... C...
// (12- 7) = << 5 = ...C .... .... ....
// ( 7-14) = >> 7 = .... .... B... ....
// ( 8- 6) = << 2 = .... ...D .... ....
// Left shifts use vm?n, a multiply, which doesn't clamp until bit 31.
// Right shifts use vm?l, which shifts the multiply result down by 16.
// Occasionally (see cols 1 & 3) we can do two shifts together if both
// bits need to be shifted in the same direction.
// Since there is only one bit in each column (they were 8 bits apart
// so only one of each pair ends up in the 14-7 window) we can combine
// with the accumulator using the vma? ops.
// We don't need to worry about the bits outside of 14-7, as long
// as they don't carry into bit 7, which doesn't happen here.
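// Worked example for column 7 with both planes set (v7 = 0x8080):
// vmudl by v8[e15] = 1<<12 gives 0x8080 >> 4 = 0x0808 (A at bit 11,
// plus a stray bit 3), and vmadn by v9[e15] = 1<<5 adds the low half
// of 0x8080 << 5, i.e. 0x1000 (C at bit 12), leaving ...C A... inside
// the 14-7 window.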
vmudl v12,v7,v8[e15]
vmadn v12,v7,v9[e15]
vmadl v12,v6,v8[e14]
vmadn v12,v6,v9[e14]
// Column 5,4
vmudl v13,v5,v8[e13]
vmadn v13,v5,v9[e13]
vmadl v13,v4,v8[e12]
vmadn v13,v4,v9[e12]
// Load attribute bits
luv v16[e0], 0(t8)
// Column 3,2
vmudn v14,v3,v9[e11]
vmadl v14,v2,v8[e10]
vmadn v14,v2,v9[e10]
// Column 1,0
vmudn v15,v1,v9[e9]
vmadl v15,v0,v8[e8]
vmadn v15,v0,v9[e8]
// Add attribute bits to pixels
// These are in bytes as
// .... ..AB
// luv loads them as
// .... ...A B... ....
// and we want them at
// .AB. .AB. .... ....
// so they can end up, after sfv, as
// AB.. AB..
// this is done by multiplying with 0b0100'0100,
// effectively (x<<2)|(x<<6), which is stashed in
// an otherwise unused element of ShiftMux0.
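// Worked example with both attribute bits set (byte 0b0000'0011):
// luv gives 0x0180; 0x0180 * 0b0100'0100 = 0x6600, i.e. .AB..AB. in
// bits 14,13,10,9, which the 14-7 window reads back as AB..AB..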
vmudn v16,v16,v8[e9]
// Each element of v16 holds doubled attributes for
// each row, combine with each pair of pixels.
vor v12,v12,v16[e0]
vor v13,v13,v16[e0]
vor v14,v14,v16[e0]
vor v15,v15,v16[e0]
//define HAS_SFV()
if {defined HAS_SFV} {
// Store Columns 7,6
sfv v12[e0],0(a3)
sfv v12[e8],16(a3)
addi a3, 1
// Store Columns 5,4
sfv v13[e0],0(a3)
sfv v13[e8],16(a3)
addi a3, 1
// Store Columns 3,2
sfv v14[e0],0(a3)
sfv v14[e8],16(a3)
addi a3, 1
// Store Columns 1,0
sfv v15[e0],0(a3)
sfv v15[e8],16(a3)
addi a3, 1+(32-4)
} else {
macro sfv_sim(e) {
evaluate rep_i(0)
mfc2 t1,v12[{e}]
srl t2, t1, 7
sb t2, 0(a3)
addiu a3, 1
mfc2 t1,v13[{e}]
srl t2, t1, 7
sb t2, 0(a3)
addiu a3, 1
mfc2 t1,v14[{e}]
srl t2, t1, 7
sb t2, 0(a3)
addiu a3, 1
mfc2 t1,v15[{e}]
srl t2, t1, 7
sb t2, 0(a3)
addiu a3, 1
evaluate rep_i({rep_i}+1)
}
sfv_sim(e0)
sfv_sim(e2)
sfv_sim(e4)
sfv_sim(e6)
sfv_sim(e8)
sfv_sim(e10)
sfv_sim(e12)
sfv_sim(e14)
}
addi t0, -16
bnez t0, bg_loop
addi t8, 8
// Adjust for fine X scroll
scope FineXBG {
constant lines_left(sp_s0)
constant left_shift(sp_s1)
constant right_shift(sp_s2)
constant leftover(sp_s3)
constant tiles_left(sp_s4)
constant tile32(sp_s5)
// Working backwards in each line, shift each tile left, combine with the
// part that was shifted out of the previous tile.
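// A 4bpp pixel is 4 bits and a tile row is one 32-bit word (8 pixels),
// so a fine X of n pixels becomes a shift of n*4 bits (the sll by 2
// below); e.g. n = 2 gives shifted = (word << 8) | leftover, with the
// word's top 8 bits becoming the leftover for the next word processed.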
lli a2, dmem_dst+dst_bg+(32*8-1)*4
lli tile32, dmem_dst+dst_bg+(33*8-1)*4
lli lines_left, 8-1
shift_line_loop:
lbu left_shift, dmem_src+src_bg_x (lines_left)
lw leftover, 0 (tile32)
addi tile32, -4
lli tiles_left, 32
bnez left_shift,+
sll left_shift, 2
// srlv can only do up to 31 bits, so we can't do this for X=0 (it would be useless anyway)
j shift_tile_loop_end
addi a2, -32*4
+
// overloading tiles_left's 32 as 32 bits here
sub right_shift, tiles_left, left_shift
srlv leftover, right_shift
shift_tile_loop:
// SU loads take 3 cycles, so unroll these loops 4x
evaluate rep_i(0)
while {rep_i} < 4 {
evaluate src(t0 + {rep_i})
lw {src}, -{rep_i} * 4 (a2)
evaluate rep_i({rep_i} + 1)
}
evaluate rep_i(0)
while {rep_i} < 4 {
evaluate src(t0 + {rep_i})
evaluate shifted(t4 + {rep_i})
sllv {shifted}, {src}, left_shift
or {shifted}, leftover
srlv leftover, {src}, right_shift
sw {shifted}, -{rep_i} * 4 (a2)
evaluate rep_i({rep_i} + 1)
}
addi tiles_left, -4
bnez tiles_left, shift_tile_loop
addi a2, -4*4
shift_tile_loop_end:
bnez lines_left, shift_line_loop
addi lines_left, -1
} // end scope FineXBG
// DMA out BG
lli t0, dmem_dst
mtc0 t0, C0_MEM_ADDR
mtc0 a1, C0_DRAM_ADDR
lli t0, dst_sp-dst_bg-1
mtc0 t0, C0_WR_LEN
addi a1, dst_sp-dst_bg
// There's only one port to DMEM, so it's a good idea to wait for this DMA to
// finish before proceeding to the sprites.
-
mfc0 t0, C0_DMA_BUSY
bnez t0,-
nop
// ##### Sprites
// Zero out the lines
lli t0, dmem_dst+0x80
lli t1, 256*8
-
sqv v31[e0],0(t0)
addi t1, -16
bnez t1,-
addi t0, 16
// Convert to 8bpp
lli a2, dmem_src+src_sp_pat
lli a3, dmem_src+src_sp_atr
lli sp_s1, 8
lli sp_s2, dmem_src+src_sp_x
lli sp_s3, dmem_dst+0x80
lqv v11[e0], 0(a2)
sprite_loop:
macro sprite_convert(_76,_54,_32,_10) {
// Column 7,6
vmudl v13,v7,v29[e15]
vmadn v13,v7,v30[e15]
vand v13,v13,v28[e8]
vmudl v14,v6,v29[e14]
vand v14,v14,v28[e9]
vor {_76},v13,v14[e0]
// Column 5,4
vmudl v13,v5,v29[e13]
vmadn v13,v5,v30[e13]
vand v13,v13,v28[e8]
vmudl v14,v4,v29[e12]
vand v14,v14,v28[e9]
vor {_54},v13,v14[e0]
// Column 3,2
vmudl v13,v3,v29[e11]
vmadn v13,v3,v30[e11]
vand v13,v13,v28[e8]
vmudl v14,v2,v29[e10]
vand v14,v14,v28[e9]
vor {_32},v13,v14[e0]
// Column 1,0
vmudl v13,v1,v29[e9]
vmadn v13,v1,v30[e9]
vand v13,v13,v28[e8]
vmudl v14,v0,v29[e8]
vmadn v14,v0,v30[e8]
vand v14,v14,v28[e9]
vor {_10},v13,v14[e0]
// Add attributes
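// Roughly: lpv puts the attribute byte (.... ..AB, as in the BG path)
// into bits 9,8 of each element; vmudl by v8[e11] (1<<10, a >>6) moves
// it to bits 3,2, vmudn by 0x0101 copies it into both bytes (bits 11,10
// and 3,2), and OR-ing 0x1010 sets the sprite palette bit, so each
// output byte reads 0001'ABcc once the pixel bits cc are OR'd in below.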
vmudl v24,v24,v8[e11]
vmudn v24,v24,v10[e8]
vor v24,v24,v10[e12] // 0x1010, sprite palette
vor {_76},{_76},v24[e0]
vor {_54},{_54},v24[e0]
vor {_32},{_32},v24[e0]
vor {_10},{_10},v24[e0]
}
// First line (with 2 lines we can do a full transpose)
lpv v24[e0], 0(a3) // attributes
select_bits(v11)
// Preload next line
lqv v11[e0], 16(a2)
sprite_convert(v16,v17,v18,v19)
// Second line
lpv v24[e0], 8(a3) // attributes
select_bits(v11)
// Preload next line
lqv v11[e0], 32(a2)
sprite_convert(v20,v21,v22,v23)
// So now we have
// v16: pixel 0,1 of 8 sprites on line 0
// v17: pixel 2,3
// v18: pixel 4,5
// v19: pixel 6,7
// v20: pixel 0,1 of 8 sprites on line 1
// v21: pixel 2,3, line 1
// v22: pixel 4,5, line 1
// v23: pixel 6,7, line 1
// (0.0,0.1),...
// (0.2,0.3),...
// (0.4,0.5),...
// (0.6,0.7),...
// (8.0,8.1),...
// (8.2,8.3),...
// (8.4,8.5),...
// (8.6,8.7),...
// The idea is to transpose this to
// (0.0,0.1),(0.2,0.3),(0.4,0.5),(0.6,0.7),(8.0,8.1),(8.2,8.3),(8.4,8.5),(8.6,8.7)
// ...
// v16: line 0 sprite 0, line 1 sprite 0
// v17: line 0 sprite 1, line 1 sprite 1
// v18: line 0 sprite 2, line 1 sprite 2
// v19: line 0 sprite 3, line 1 sprite 3
// v20: line 0 sprite 4, line 1 sprite 4
// v21: line 0 sprite 5, line 1 sprite 5
// v22: line 0 sprite 6, line 1 sprite 6
// v23: line 0 sprite 7, line 1 sprite 7
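// stv writes one element from each of v16..v23 into a 16-byte slice,
// and ltv reads a slice back into one element of each of v16..v23, so
// the seven store/load pairs below rotate the 8x8 block of halfwords
// into the transposed layout above (see the element mapping noted below).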
lli t0, dmem_dst
// Note: I'm not 100% sure that this is the layout in DMEM.
// v17[0],v18[1],v19[2],v20[3],v21[4],v22[5],v23[6],v16[7] -> 0x70
stv v16[e2], 0x70(t0)
stv v16[e4], 0x60(t0)
stv v16[e6], 0x50(t0)
stv v16[e8], 0x40(t0)
stv v16[e10], 0x30(t0)
stv v16[e12], 0x20(t0)
stv v16[e14], 0x10(t0)
// 0x70 -> v16[1],v17[2],v18[3],v19[4],v20[5],v21[6],v22[7],v23[0]
ltv v16[e14], 0x70(t0)
ltv v16[e12], 0x60(t0)
ltv v16[e10], 0x50(t0)
ltv v16[e8], 0x40(t0)
ltv v16[e6], 0x30(t0)
ltv v16[e4], 0x20(t0)
ltv v16[e2], 0x10(t0)
// Fill in each sprite
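// Each sdv writes one sprite's 8-byte line into the 256-byte (8bpp)
// line buffer at that sprite's X coordinate, read from src_sp_x.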
lbu t3, 7(sp_s2)
lbu t2, 6(sp_s2)
lbu t1, 5(sp_s2)
lbu t0, 4(sp_s2)
add t3, sp_s3
sdv v23[e0], 0(t3)
add t2, sp_s3
sdv v22[e0], 0(t2)
add t1, sp_s3
sdv v21[e0], 0(t1)
add t0, sp_s3
sdv v20[e0], 0(t0)
lbu t3, 3(sp_s2)
lbu t2, 2(sp_s2)
lbu t1, 1(sp_s2)
lbu t0, 0(sp_s2)
add t3, sp_s3
sdv v19[e0], 0(t3)
add t2, sp_s3
sdv v18[e0], 0(t2)
add t1, sp_s3
sdv v17[e0], 0(t1)
add t0, sp_s3
sdv v16[e0], 0(t0)
addi sp_s3, 256
addi sp_s2, 8
lbu t3, 7(sp_s2)
lbu t2, 6(sp_s2)
lbu t1, 5(sp_s2)
lbu t0, 4(sp_s2)
add t3, sp_s3
sdv v23[e8], 0(t3)
add t2, sp_s3
sdv v22[e8], 0(t2)
add t1, sp_s3
sdv v21[e8], 0(t1)
add t0, sp_s3
sdv v20[e8], 0(t0)
lbu t3, 3(sp_s2)
lbu t2, 2(sp_s2)
lbu t1, 1(sp_s2)
lbu t0, 0(sp_s2)
add t3, sp_s3
sdv v19[e8], 0(t3)
add t2, sp_s3
sdv v18[e8], 0(t2)
add t1, sp_s3
sdv v17[e8], 0(t1)
add t0, sp_s3
sdv v16[e8], 0(t0)
addi sp_s3, 256
addi sp_s2, 8
addi a2, 16*2
addi sp_s1, -2
bnez sp_s1,sprite_loop
addi a3, 8*2
// DMA out sprites
lli t0, dmem_dst+0x80
mtc0 t0, C0_MEM_ADDR
mtc0 a1, C0_DRAM_ADDR
lli t0, conv_dst_size-dst_sp-1
mtc0 t0, C0_WR_LEN
-
mfc0 t0, C0_DMA_BUSY
bnez t0,-
nop
addi a1, conv_dst_size-dst_sp
addi a0, conv_src_size
break
}
pullvar base
arch n64.cpu
align(4)