Skip to content

Instantly share code, notes, and snippets.

@mattgodbolt
Created August 22, 2019 22:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mattgodbolt/6d010089f7693cdbac1165c471338881 to your computer and use it in GitHub Desktop.
Save mattgodbolt/6d010089f7693cdbac1165c471338881 to your computer and use it in GitHub Desktop.
A bit of Red Dog: Superior Firepower's rendering system. SH4 Assembly, circa 1998
; All new, all-singing, all-dancing textured scape renderer!
; Hand-scheduled SH4 source.  The macros DEC_XYZW, PUSH, POP and CMPZ used in
; this file are not defined here; presumably they come from RedDog.pre
; (TODO confirm).
.CPU SH4
.OUTPUT DBG=DWARF
.INCLUDE "..\RedDog.pre"
; Prototype: StripHeader *texturedStripRasteriserClipped
; (StripPos *v, StripEntry *strip, Uint32 nStrip, ModelContext *context);
; The entry code uses R4-R7 directly as these four arguments, in prototype
; order (aliases vertTab, strip, nStrip and context below), and leaves the
; result in R0 (alias retVal).
; Register allocation:
; Aliases marked [olap] share a physical register with an earlier alias; the
; two uses are never live at the same time.  Note that the alias 'v' (R8) is
; a scratch pointer, not the prototype's 'v' argument (that one is vertTab).
temp: .REG R0 ; temporary register, R0 is most accessible
retVal: .REG R0 ; r0 is also the return value
lightingBuf: .REG R1 ; the lighting buffer
curVertex: .REG R2 ; the number of the vertex in the 3-vert round-robin buffer
pkm: .REG R3 ; where to stick the next vertex, TA-wise
vertTab: .REG R4 ; address of the vertex table
strip: .REG R5 ; the strip pointer
nStrip: .REG R6 ; number of vertices left in this strip
context: .REG R7 ; pointer to the context
v: .REG R8 ; pointer to current vertex position
pcw: .REG R8 ; [olap] paramcontrolword
outcode: .REG R9 ; current outcode
temp2: .REG R10 ; another temporary register
diff: .REG R10 ; [olap] difference in outcoding
v0: .REG R10 ; [olap] vertex 0 pointer in clipping
mask: .REG R11 ; used as a mask, and a temporary register
v1: .REG R11 ; [olap] vertex 1 pointer in clipping
uv_u: .REG R12 ; u value
abs_w: .REG R12 ; [olap] absolute binary W
v2: .REG R12 ; [olap] vertex 2 pointer in clipping
uv_v: .REG R13 ; v value
t: .REG R13 ; [olap] used in outcoding
counter: .REG R13 ; [olap] used in clipping
col: .REG R14 ; colour value
temp3: .REG R14 ; [olap] more temp action
addrV1: .REG R14 ; [olap] vertex 1's position in the alpha
; Floating point madness
; DEC_XYZW is an external macro; the code below relies on it providing the
; names vert, vertX, vertY, vertZ, vertW, vertXY and vertZW.  The arguments
; 4, 5, 6, 7 match X..W = FR4..FR7 below (presumably the same registers).
DEC_XYZW vert, 4, 5, 6, 7 ; vertex position
X: .FREG FR4
Y: .FREG FR5
Z: .FREG FR6
W: .FREG FR7 ; NOT tother way aroond
midX: .FREG FR8 ; screen midpoint X
midY: .FREG FR9 ; screen midpoint Y
mulX: .FREG FR10 ; screen size X
mulY: .FREG FR11 ; screen size Y
rW: .FREG FR0 ; 1/w; FR0 because it is used as the FMAC multiplier
alpha: .FREG FR0 ; [olap] alpha value
rWnearVal: .FREG FR1 ; rW at camera plane
U: .FREG FR2
V: .FREG FR3
; Offsets:
; Byte offsets of the fields in one vertex work buffer, as filled in by
; PrepareVertex.  The main routine keeps three such buffers on the stack,
; spaced 64 bytes apart; _padding (48) is the first unused byte of each.
_X: .EQU 0
_Y: .EQU 4
_Z: .EQU 8
_W: .EQU 12
_U: .EQU 16
_V: .EQU 20
_preColour: .EQU 24
_dynColour: .EQU 28
_outcode: .EQU 32
_rW: .EQU 36
_scrY: .EQU 40
_scrX: .EQU 44
_padding: .EQU 48
.SECTION PSCAPERAST, CODE, ALIGN=16
; Outcode description:
; bit | meaning
; -----+-----------
; 0 | Off near
; 1 | Off left
; 2 | Off right
; 3 | Off top
; 4 | Off bottom
; pad: scheduling placeholder used between instructions throughout this file.
; It expands to nothing because the NOP in its body is commented out;
; restoring that NOP would emit one NOP at every 'pad' site.
.MACRO pad
; NOP no nops for speed
.ENDM
; curVertex bits:
; bits 0-1 hold the index (0-2) of the current vertex in the three-entry vertex buffer
; bit 3 (mask 1<<3) is the parity; if set then the triangle is the 'right way around'
; bit 4 (mask 1<<4) is the continuation flag; if clear then the previous two vertices need to be sent before 'this'
; Subroutine: Prepares a vertex read from strip into the buffer pointed to
; by R0
; In:  R0 = destination vertex buffer (fields at the _X.._scrX offsets)
;      strip = next strip entry; two longwords are consumed: a vertex
;      offset and a packed UV word.  The vertex offset is used unscaled as
;      a byte offset into lightingBuf and multiplied by 4 as a byte offset
;      into vertTab.
;      vertTab, lightingBuf, midX, midY, mulX, mulY and XMTRX must be set up.
; Out: R0 restored to the start of the buffer, strip advanced by 8 bytes,
;      buffer holds transformed X/Y/Z/W, U, V, both colours, the outcode,
;      1/w and the screen X/Y.
; Corrupts: everything barring the pointers
;      (v/pcw, outcode, temp2, mask, uv_u, uv_v/t, col/temp3, rW, U, V,
;      the vert registers, FPUL and the T bit)
.ALIGN 16
PrepareVertex:
MOV.L @strip+, temp2 ; read in the vertex number
pad
pad
pad
MOV temp2, v ; move into the v register
SHLL2 v ; get v as an offset into the vertex table (x4)
ADD vertTab, v ; v now points at the vertex in question
FLDI1 vertW ; vertW = 1.0, all set for the transformation
FMOV.S @v+, vertX ; read in the X position
MOV #-1, mask ; get mask=0xffffffff
FMOV.S @v+, vertY ; read in Y position
SHLR mask ; mask is now 0x7fffffff (clears a float's sign bit)
FMOV.S @v+, vertZ ; read in the Z position
ADD lightingBuf, temp2 ; temp2 = address of this vertex's lightingBuf entry
MOV.L @strip+, uv_u ; read the UV value from strip
pad
FTRV XMTRX, vert ; transform the vertex [5-7 cyc lat]
MOV.L @temp2, temp2 ; read the dynamic lighting value
FLDI1 rW ; get 1 in rW ready for reciprocation
ADD #32, strip ; move strip on for a prefetch (undone before returning)
MOV.L temp2, @(_dynColour, R0); store in the dynamic colour
EXTU.W uv_u, uv_v ; clear the top 16 bits of uv to get v into v
MOV.L @v+, col ; fourth longword of the vertex entry: the prelit colour
SHLR16 uv_u ; move u down to clear the bottom bits
SHLL16 uv_v ; shift v back up == VVVV0000
FLDI0 U ; U = 0.0 ready for the comparison with Z
MOV.L col, @(_preColour, R0) ; store in the prelit colour
SHLL16 uv_u ; shift U back up == UUUU0000
MOV.L uv_v, @(_V, R0) ; store V
pad
MOV.L uv_u, @(_U, R0) ; store U
ADD #_U, R0 ; move R0 past the point data (stored below with pre-decrement)
; FTRV ready here
FLDS W, FPUL ; get bin(W)
FCMP/GT Z, U ; T = (0 > Z), ie is Z < 0 [4 cyc lat]
STS FPUL, abs_w ; abs_w is bin(W)
pad
FLDS X, FPUL ; load X into FPUL ready for bin(X)
AND mask, abs_w ; get abs(bin(W))
STS FPUL, t ; get bin(X) into t
MOV abs_w, diff ; diff = abs_w
FLDS Y, FPUL ; load Y into FPUL for bin(Y)
MOV t, temp3 ; blimey another temporary register
AND mask, temp3 ; temp3 is abs(bin(X))
FDIV W, rW ; rW = 1 / W, get some reciprocation going now
MOVT outcode ; bit 0 is now (Z < 0) (near clip bit)
PREF @strip ; prefetch the next strip information
SHLL t ; T bit is top bit of bin(X)
FMOV.S W, @-R0 ; store W
MOVT t ; t = sign bit of X (1 if negative)
FMOV.S Z, @-R0 ; store Z
SUB temp3, diff ; diff = abs_w - abs(bin(X))
FMOV.S Y, @-R0 ; store Y
SHLL diff ; T bit = sign(abs_w - abs(bin(X))), set when |X| > |W|
STS FPUL, temp3 ; temp3 is bin(Y)
ADD #1, t ; t is now 1 + sign of X: 1 for X >= 0, 2 for X < 0
FMOV.S X, @-R0 ; store X (R0 is back at the start of the buffer)
MOVT diff ; diff = T bit == sign(abs_w - abs(bin(X)))
FMUL mulX, X ; X = X * screen size X
SHLD t, diff ; diff = T bit << t, i.e. 0, 2 or 4
FMUL mulY, Y ; Y = Y * screen size Y
OR diff, outcode ; outcode |= 0, 2 or 4 depending on X outcode
pad
MOV temp3, t ; get a copy of bin(y)
AND mask, temp3 ; temp3 = abs(bin(y))
SHLL t ; get top bit into t == sign of y
FMOV.S midX, U ; U is mid X ready for the FMAC
MOVT t ; t == 1 or 0 depending on sgn (y)
FMOV.S midY, V ; V is mid Y ready for the FMAC
MOV abs_w, diff ; move abs_w into diff
SUB temp3, diff ; diff = abs_w - abs(bin(y))
SHLL diff ; T bit = sign(abs_w - abs(bin(y))), set when |Y| > |W|
FMAC rW, X, U ; U = midX + X * screen size * rW
ADD #3, t ; t = 3 (Y >= 0) or 4 (Y < 0)
FMAC rW, Y, V ; V = midY + Y * screen size * rW
MOVT diff ; diff = T bit = sign(abs_w - abs(bin(y)))
pad
ADD #_rW, R0 ; point R0 at 1/w's place
pad
SHLD t, diff ; diff = sign(abs_w - abs(bin(y))) << t, i.e. 0, 8 or 16
FMOV.S rW, @R0 ; store 1/w
OR diff, outcode ; outcode is now complete - huzzah!
pad
ADD #12, R0 ; point R0 just past _scrX for the pre-decrement stores
FMOV.S U, @-R0 ; store transformed X at _scrX
FMOV.S V, @-R0 ; store transformed Y at _scrY
ADD #-_scrY, R0 ; move R0 back to the beginning
ADD #-32, strip ; move strip back to whence it came
RTS ; return, and as a slot:
MOV.L outcode, @(_outcode, R0); [slot] store the outcode back
; Support routine: Output a vertex
; vertex pointed to by r0, output is pkm
; Writes seven longwords at pkm+4 .. pkm+28, in ascending address order:
; screen X, screen Y, 1/w, U, V, prelit colour, dynamic colour.
; The longword at pkm+0 (the PCW) is NOT written here; the caller stores it.
; leaves pkm pointing at the PCW, and with SQ requiring dispatch
; corrupts FP (rW, U, V and the vert registers), temp3 and pcw
; sets pcw to be 0xe0000000
; returns (curVertex & 3) + temp3 in R0, minus 3 if that sum is >= 3
; (callers that ignore R0 do not need to set temp3)
.ALIGN 16
OutputVertex:
ADD #_U, R0 ; point r0 at the U value
ADD #32, pkm ; point at the end of the vertex
FMOV.S @R0+, U ; read U
FMOV.S @R0+, V ; read V
MOV #H'E, pcw ; pcw = 0xe
FMOV.S @R0+, vertX ; read prelit colour (as raw bits)
SHLL16 pcw ; pcw = 0xe<<16
FMOV.S @R0, vertY ; read dynamic lighting colour (no post-increment)
ADD #8, R0 ; skip dynColour and outcode to reach 1/w
FMOV.S @R0+, vertZ ; read rW (1/w)
SHLL8 pcw ; pcw = 0xe<<24
FMOV.S @R0+, rW ; read screen Y (_scrY) into the rW register
SHLL2 pcw ; pcw = 0xe<<26
FMOV.S @R0+, vertW ; read screen X (_scrX)
SHLL2 pcw ; pcw = 0xe<<28 == 0xe0000000
FMOV.S vertY, @-pkm ; store dynamic lighting
MOV curVertex, R0 ; get curvertex
FMOV.S vertX, @-pkm ; store prelighting
AND #3, R0 ; just get the vertex number
FMOV.S V, @-pkm ; store V
ADD temp3, R0 ; curVertex + temp3
MOV #3, temp3 ; check 3
FMOV.S U, @-pkm ; store U
CMP/GE temp3, R0 ; check for curVertex overflow (T = R0 >= 3)
FMOV.S vertZ, @-pkm ; store 1/w
BF/S ?ok ; is it OK to skip
FMOV.S rW, @-pkm ; store screen Y [slot]
ADD #-3, R0 ; wrap R0 back into the range 0..2
?ok FMOV.S vertW, @-pkm ; store screen X
RTS
ADD #-4, pkm ; [slot] reset pkm to point at pcw
; Support routine: Output the vertex where the edge between the vertex at R0
; (v0) and the vertex at addrV1 (v1) crosses Z = 0
; alpha = v1.z / (v1.z - v0.z), and every interpolated attribute is
; v1 + alpha * (v0 - v1).  X and Y are interpolated before projection, then
; multiplied by rWnearVal and mulX/mulY and offset by midX/midY; rWnearVal
; itself is stored as the vertex's 1/w.
; In:  R0 = v0 buffer, addrV1 = v1 buffer, pkm = output slot.  pkm must be
;      8-byte aligned: the colour loop's exit test is (pkm & 7) == 0.
; Out: seven longwords at pkm+4 .. pkm+28 (screen X, screen Y, 1/w, U, V,
;      prelit colour, dynamic colour); pkm itself is unchanged and points at
;      the PCW slot, which is not written here.
; Corrupts all FP, PCW, R0 and addrV1; outcode, temp2 and t are saved and
; restored around the colour loop.
OutputAlpha: ; first read Z for both to get the alpha calculation done first
ADD #_Z, R0 ; point R0 at v0.z
pad
ADD #_Z, addrV1 ; point addrV1 at v1.z
FMOV.S @R0, Z ; read v0.z into Z
FMOV.S @addrV1, alpha ; read v1.z into alpha
ADD #-_Z, R0 ; put r0 back to the beginning
FMOV.S @R0+, X ; read v0.x into X
ADD #-_Z, addrV1 ; move addrV1 back to the beginning also
FMOV alpha, W ; take a copy of v1.z into W
FSUB Z, W ; W = v1.z - v0.z
FMOV.S @R0+, Y ; read v0.y into Y
ADD #8, R0 ; move R0 to point at the UV values
FMOV.S @addrV1+, U ; U = v1.X
pad
FDIV W, alpha ; alpha = v1.z / (v1.z - v0.z) in about 12 cycle's time
FMOV.S @addrV1+, V ; V = v1.Y
FMOV X, W ; W is temporarily v0.X
FSUB U, W ; W = v0.X - v1.X
FMOV Y, Z ; Z is temporarily v0.Y
FSUB V, Z ; Z = v0.Y - v1.Y
; stall for 10 cycles here - travesty!
ADD #8, addrV1 ; move on to the uvs
FMAC alpha, Z, V ; V = v1.Y + alpha * (v0.Y - v1.Y)
FMAC alpha, W, U ; U = v1.X + alpha * (v0.X - v1.X)
pad
pad
pad
FMUL rWnearVal, V ; perspectivize Y
pad
FMUL rWnearVal, U ; perspectivize X
ADD #16, pkm ; move pkm on to the end of the X/Y/1/w area
FMUL mulY, V ; scale to fit on screen in Y
FMOV.S @R0+, X ; X is v0.u
FMUL mulX, U ; scale to fit on screen in X
FMOV.S @R0+, Y ; Y is v0.v
FADD midY, V ; V is a screen pos at last too
pad
FADD midX, U ; U is a screen pos at last
FMOV.S rWnearVal, @-pkm ; store 1/near clip value
FMOV.S V, @-pkm ; store Y value
pad
FMOV.S U, @-pkm ; store X value
;stall
FMOV.S @addrV1+, U ; read v1.U into U
FMOV.S @addrV1+, V ; read v1.V into V
FMOV X, Z ; copy v0.U into Z
FSUB U, Z ; Z = v0.U - v1.U
FMOV Y, W ; copy v0.V into W
FSUB V, W ; W = v0.V - v1.V
pad
pad
FMAC alpha, Z, U ; U = v1.U + alpha * (v0.U - v1.U)
FMAC alpha, W, V ; V = v1.V + alpha * (v0.V - v1.V)
ADD #20, pkm ; move pkm to the end of the UV area
pad
pad
FMOV.S V, @-pkm ; store interpolated V
FMOV.S U, @-pkm ; store interpolated U
; Colour interpolation: two passes of four bytes each, working downwards
; from the top byte of the dynamic colour to the bottom byte of the prelit
; colour.  outcode is borrowed as the 32-bit accumulator.
PUSH outcode
PUSH temp2
ADD #16, pkm ; point at end of colour
PUSH t
MOV #4, temp2 ; loop counter is 4
ADD #7, R0 ; move to end of colour (last byte of v0's dynamic colour)
ADD #7, addrV1 ; move to end of colour (last byte of v1's dynamic colour)
MOV #0, outcode ; clear the accumulator
?colourLoop MOV.B @R0, pcw ; read a byte of colour into pcw
ADD #-1, R0 ; move down
EXTU.B pcw, pcw ; zero those top bits
MOV.B @addrV1, t ; read a second byte of colour
ADD #-1, addrV1 ; move down
LDS pcw, FPUL ; FPUL madness
EXTU.B t, t ; zero those top bits
FLOAT FPUL, X ; X is colour 1
SHLL8 outcode ; make room for the next byte; four of these also flush the previous longword
CMP/EQ t, pcw ; are they the same?
LDS t, FPUL ; FPUL madness again
BT ?sameColours ; identical bytes need no interpolation, pcw already holds the result
FLOAT FPUL, Y ; Y is colour 2
FSUB Y, X ; X = (colour 1 - colour 2)
FMAC alpha, X, Y ; Y = colour 2 + (colour 1 - colour 2) * alpha
FTRC Y, FPUL ; FPUL is the colour
STS FPUL, pcw
?sameColours DT temp2
BF/S ?colourLoop
OR pcw, outcode ; [slot] merge the byte into the accumulator
MOV.L outcode, @-pkm ; store the finished colour longword
; have we done?
MOV #7, t
TST t, pkm ; T set once pkm is 8-byte aligned, i.e. after the second colour
BT ?hoorayTheEnd
BRA ?colourLoop
MOV #4, temp2 ; [slot] restart the byte counter; the accumulator needs no reset (see SHLL8 above)
?hoorayTheEnd
POP t
POP temp2
POP outcode
RTS
ADD #-24, pkm ; [slot] reset pkm to the beginning
; Entry point: _texturedStripRasteriserClipped
; Transforms the vertices of one or more triangle strips and writes the
; visible triangles as 32-byte vertices through pkm, dispatching each one
; with PREF @pkm (the 'SQ blast').
; In:  vertTab (R4), strip (R5), nStrip (R6), context (R7) as per the prototype.
;      Context fields read here: +0 pkm, +4 material, +8 midX, +12 midY,
;      +16 lightingBuf, +20 mulX, +24 mulY, +28 rWnearVal.
;      After each strip's vertex entries the strip data holds two longwords,
;      the next strip's vertex count and its material; processing continues
;      while the count is non-zero and the material equals context+4.
; Out: R0 = address of the (count, material) pair that stopped the loop;
;      the updated pkm is written back to context+0.
; Stack: saves R8-R14 and PR, then reserves 192 bytes for three 64-byte
;      vertex buffers at R15+0, R15+64 and R15+128.
; Note: nStrip is decremented by 2 and then counted down with DT, so a strip
;      is presumably always at least 3 vertices long - TODO confirm with caller.
.EXPORT _texturedStripRasteriserClipped
.ALIGN 16
_texturedStripRasteriserClipped:
; Enormous stacking action
PREF @strip ; start by prefetching the strip information
PUSH R14
PUSH R13
PUSH R12
PUSH R11
PUSH R10
PUSH R9
PUSH R8
STS.L PR,@-R15 ; end of stacking
MOV.L @context, pkm ; read pkm from context
ADD #8, context ; skip matnum
FMOV.S @context+, midX ; read midX
MOV #96, R0
FMOV.S @context+, midY ; read midY
ADD #96, R0 ; R0 is now 192
MOV.L @context+, lightingBuf ; read lightingBuf
SUB R0, R15 ; rewind stack enough for three vertex buffers
FMOV.S @context+, mulX ; read mulX
MOV #2, curVertex ; read the coming vertex into vertbuf2
FMOV.S @context+, mulY ; read mulY
ADD #-2, nStrip ; nStrip now counts triangles rather than vertices
FMOV.S @context, rWnearVal ; read rWnearVal (no post-increment)
ADD #-7*4, context ; rewind context
; Read in the first two vertices : (nStrip must be pre-decremented and curVertex 2)
?bigLoop:
BSR PrepareVertex ; vertex #1
MOV R15, R0 ; buffer @sp+0 [slot]
MOV R15, R0 ; vertex #2
BSR PrepareVertex ; buffer @sp+64
ADD #64, R0 ; [slot]
; Optional statistics: add this strip's triangle count to the longword that
; a_nDrawn points at.
.AIF \&COUNT_GEOMETRY EQ 1
MOV.L #a_nDrawn, temp2
MOV.L @temp2, temp2
MOV.L @temp2, R0
ADD nStrip, R0
MOV.L R0, @temp2
.AENDI
?loop ; Read in another vertex at the current vertex position
MOV curVertex, R0 ; r0 = curVertex
AND #3, R0 ; ensure only vertex number is around
SHLL8 R0 ; r0 = curVertex * 256
pad
SHLR2 R0 ; r0 = curVertex * 64
BSR PrepareVertex ; prepare that vertex
ADD R15, R0 ; [slot] r0 is address of new vertex now
; incrememnt curVertex, and toggle parity, ugly bit of code
; afterwards the low bits index the OLDEST of the three buffered vertices,
; so the current triangle is cur, cur+1, cur+2 (indices modulo 3)
MOV curVertex, R0
AND #3, R0
ADD #1, R0
CMP/EQ #3, R0
BF ?ok
MOV #0, R0 ; wrap 3 back to 0
?ok MOV #-4,t ; not 3
AND t, curVertex ; keep only the flag bits
OR curVertex, R0 ; curVertex incremented
XOR #(1<<3), R0 ; toggle parity
MOV R0, curVertex
; Now check the outcode; outcode will be A|B|C, temp A&B&C
MOV.L @(_outcode, R15), outcode ; outcode is A
MOV R15, temp3
MOV outcode, temp ; temp is going to be the AND version
ADD #64, temp3
MOV.L @(_outcode, temp3), temp2 ; temp2 is B
ADD #64, temp3
MOV.L @(_outcode, temp3), temp3 ; temp3 is C
OR temp2, outcode ; outcode |= B
AND temp2, temp ; temp &= B
OR temp3, outcode ; outcode |= C
AND temp3, temp ; temp &= C
CMPZ temp ; is temp zero; if not, we're totally offscreen (CMPZ is an external macro, presumably sets T when zero)
pad
BF ?offScreen ; branch if not zero to off screen
MOV outcode, temp
TST #1, temp ; is any of this triangle nearclipped? (T set when bit 0 is clear)
pad
BF ?clipPoly ; if nearclipped, go ahead and clip it
?onScreen ; check to see if this is the first polygon
MOV #(1<<4), R0
TST R0, curVertex ; check continuation bit
pad
BF ?continueStrip ; if continuation bit is 1, continue the strip
MOV #(1<<3), R0
TST R0, curVertex ; check parity bit
pad
BF ?parityOK ; correct parity, so carry on
; we're going to output a dummy vertex of curVertex + 1
; (it is sent again as the second real vertex below)
MOV curVertex, R0 ; r0 = curVertex
AND #3, R0 ; ensure only vertex number is around
ADD #1, R0 ; + 1
CMP/EQ #3, R0 ; ovf?
BF ?parityovfOK
MOV #0, R0
?parityovfOK
SHLL8 R0 ; r0 = curVertex+1 * 256
pad
SHLR2 R0 ; r0 = curVertex+1 * 64
BSR OutputVertex ; and output vertex cur+1 as the dummy (returned R0 is ignored)
ADD R15, R0 ; [slot] point r0 in the right place
MOV pcw, @pkm ; store e0000000 (no size suffix; presumably assembles like the MOV.L used below - confirm)
pad
PREF @pkm
ADD #32, pkm
?parityOK
; Now we have to output the three vertices of the first triangle in a strip
; That is, the vertices at curVertex, curVertex+1 and curVertex+2
MOV curVertex, R0 ; r0 = curVertex
AND #3, R0 ; ensure only vertex number is around
SHLL8 R0 ; r0 = curVertex * 256
pad
MOV #1, temp3 ; add one, so OutputVertex returns cur+1 (wrapped)
pad
SHLR2 R0 ; r0 = curVertex * 64
BSR OutputVertex ; and output vertex 0 of the triangle
ADD R15, R0 ; [slot] point r0 in the right place
; store in the pcw and dispatch
MOV.L pcw, @pkm ; store e0000000
SHLL8 R0 ; R0 is (curVertex+1)<<8
PREF @pkm ; SQ blast!
ADD #32, pkm ; move on
MOV #2, temp3 ; add two this time
pad
SHLR2 R0 ; R0 is (curVertex+1)<<6
BSR OutputVertex ; output vertex 1
ADD R15, R0 ; [slot] move to the right plaice
; store in the pcw and dispatch
MOV.L pcw, @pkm ; store e0000000
SHLL8 R0 ; R0 is (curVertex+2)<<8
PREF @pkm ; SQ blast!
ADD #32, pkm ; move on
SHLR2 R0 ; R0 is (curVertex+2)<<6
BSR OutputVertex ; output vertex 2
ADD R15, R0 ; [slot] move to the right plaice
; leave the vertex hanging in the pipe, as we may be able to extend it, we may not, you never know
MOV #(1<<4), R0 ; get continuation bit
OR R0, curVertex ; curVertex has the continuation bit set
DT nStrip
BF ?loop
?endStripCont ; if we got here, then we have to terminate the poly in the current pipe
; pcw should be 0xe0000000
SHAR pcw ; e0000000 -> f0000000
MOV.L pcw, @pkm ; store terminator
PREF @pkm ; SQ blast
ADD #32, pkm ; move on to next
?end
MOV.L @strip+, nStrip ; read num strip
MOV.L @strip+, temp2 ; read material
MOV.L @(4, context), temp3 ; read should be material
CMPZ nStrip ; is the next strip's vertex count zero?
BT ?reallyTheEnd
ADD #-2, nStrip ; vertices -> triangles, as on entry
CMP/EQ temp2, temp3 ; is it the same material?
BT/S ?bigLoop ; yes? wicked!
MOV #2, curVertex ; reset curVertex [slot]
?reallyTheEnd MOV.L pkm, @context ; store back the pkm
MOV.L #192, R0 ; 192 = 3*64
ADD R0, R15 ; stack sortout
LDS.L @R15+, PR ; unstacking action
MOV strip, R0 ; get the return value
POP R8
ADD #-8, R0 ; rewind the retVal to the (count, material) pair just read
POP R9
POP R10
POP R11
POP R12
POP R13
RTS
POP R14 ; [slot]
; the polygon is offscreen, DT and reloop,
; If a strip is in progress (continuation bit set) its pending vertex is
; first dispatched with an f0000000 PCW and the bit is cleared.
; ?noFlush is also the common exit of the clipping paths: it counts the
; triangle off nStrip and either loops or moves on to the next strip.
?offScreen ; Check for some flushing action
MOV #(1<<4), R0
TST R0, curVertex ; check continuation bit (T set when it is clear)
pad
MOV.L #H'F0000000, pcw ; does not disturb T
BT ?noFlush ; no need to flush
MOV.L pcw, @pkm ; store terminator
PREF @pkm ; SQ blast
ADD #32, pkm ; move on to next
MOV #(1<<4), R0
XOR R0, curVertex ; clear continuation bit (known to be set here)
?noFlush DT nStrip
BF ?loop
BRA ?end ; else end
NOP
; We've already outputted some vertices, and this is onscreen too,
; so SQ the pending vertex, and then output curVertex + 2 (== curVertex-1)
; The new vertex is again left pending (no PCW stored) so that the next
; triangle can either extend the strip or terminate it.
?continueStrip MOV.L #H'e0000000, pcw ; get e*
MOV curVertex, R0
AND #3, R0
pad
ADD #2, R0 ; add two
pad
MOV #2, temp2
MOV.L pcw, @pkm ; PCW previous vertex
CMP/GT temp2, R0 ; r0 > temp2 ? : ovf?
PREF @pkm ; SQ blast previous vertex
BF/S ?contovfOK
ADD #32, pkm ; [slot] move on to the next output slot
ADD #-3, R0 ; get R0 ok (wrap back into 0..2)
?contovfOK SHLL8 R0
SHLR2 R0 ; get vert*64
BSR OutputVertex ; output that vertex (returned R0 is ignored)
ADD R15, R0 ; [slot] address add
DT nStrip ; finished strip?
BT ?endStripCont ; yes?...we need to flush
BRA ?loop ; no, lets go round again
NOP
; Time to clip the triangle[cur, cur+1, cur+2]
; Set up the vertex pointers correspondingly
; Any strip in progress is terminated first.  v0, v1 and v2 are then set to
; the buffer addresses of the three vertices, with the first two swapped
; when the parity bit is clear so that the winding is consistent.
?clipPoly:
MOV #(1<<4), R0
TST R0, curVertex ; check continuation bit (T set when it is clear)
pad
MOV.L #H'F0000000, pcw ; does not disturb T
BT ?noFlush2 ; no need to flush
MOV.L pcw, @pkm ; store terminator
PREF @pkm ; SQ blast
ADD #32, pkm ; move on to next
XOR R0, curVertex ; clear continuation bit (R0 still holds 1<<4)
?noFlush2
MOV curVertex, R0 ; r0 is curVertex
TST #(1<<3), R0 ; parity even or odd? (T set when the parity bit is clear)
BF/S ?evenParity ; parity bit set: vertices are already in the right order
?oddParity: AND #3, R0 ; [slot, runs on both paths] get R0 as just current vertex number
; If we got here, then we need v0=cur+1, v1=cur, v2=cur+2
MOV R0, v1
SHLL8 v1 ; v1 = R0*256
SHLR2 v1 ; v1 = R0*64
pad
ADD R15, v1 ; v1 = address of vertex cur
ADD #1, R0 ; move R0 on
CMP/EQ #3, R0 ; overflow?
BF ?skipOddOvf1
MOV #0, R0
?skipOddOvf1
MOV R0, v0 ; v0 = vertex cur+1
SHLL8 v0
SHLR2 v0 ; v0 = (cur+1) * 64
ADD R15, v0 ; v0 is address of vertex cur+1
ADD #1, R0 ; move R0 on
CMP/EQ #3, R0 ; ovf?
BF ?lastVertexAddr ; no, get last vertex into v2
BT ?lastVertexOvf ; yes, flip round and get into v2
; poly is in the right order, v0=0,v1=1,v2=2
?evenParity
MOV R0, v0
SHLL8 v0 ; v0 = R0*256
SHLR2 v0 ; v0 = R0*64
pad
ADD R15, v0 ; v0 = address of vertex 0
ADD #1, R0 ; move R0 on
CMP/EQ #3, R0 ; overflow?
BF ?skipEvenOvf1
MOV #0, R0
?skipEvenOvf1
MOV R0, v1 ; v1 = vertex 1
SHLL8 v1
SHLR2 v1 ; v1 = vertex1 * 64
ADD R15, v1 ; v1 is address of vertex 1
ADD #1, R0 ; move R0 on
CMP/EQ #3, R0 ; ovf?
BF ?lastVertexAddr ; no ovf
?lastVertexOvf MOV #0, R0 ; reset to 0
?lastVertexAddr MOV R0, v2
SHLL8 v2 ; v2 = 256*v2
SHLR2 v2 ; v2 = 64*v2
ADD R15, v2 ; v2 now points in the right place
; lets see whether this clips to a triangle or a quad
; a triangle can be blasted straight to the TA, but a quad needs
; temporary buffering and the vertices sent in 0132 order, annoyingly
; a triangle clips to a quad IFF only one vertex is off the near
; the case where all points are off the near has already been handled, so either
; one or two vertices are offscreen: a XOR will tell us the oddness or evenness
MOV.L @(_outcode, v0), outcode; outcode for v0
MOV.L @(_outcode, v1), temp ; outcode for v1
MOV.L @(_outcode, v2), v ; outcode for v2 (v shares R8 with pcw, which is dead here)
XOR temp, outcode ; outcode = v0^v1
XOR v, outcode ; outcode = v0^v1^v2
SHLR outcode ; get bottom bit into T bit
BF ?clipToTri ; bit clear: an even number (two) of nearclipped verts, result is a triangle
BRA ?clipToQuad ; bit set: exactly one nearclipped vert, result is a quad
NOP
; We've ascertained the output will be a triangle, so do each edge
; Walks v0 -> v1 -> v2 -> v0.  Each vertex whose near bit is clear is sent
; with OutputVertex, and each edge whose end points differ in the near bit
; gets an OutputAlpha vertex.  counter counts the three output vertices so
; that the last one is dispatched with f0000000 instead of e0000000.
; Finishes through ?noFlush, which counts the triangle off nStrip.
; The MOV @(_outcode, ...) loads below carry no size suffix; presumably they
; assemble as longword loads like the MOV.L forms used elsewhere - confirm.
?clipToTri:
MOV @(_outcode, v0), outcode; get v0's outcode
MOV #3, counter ; set up the vertex counter
pad
MOV outcode, R0
SHLR R0 ; get T as the nearclip bit
BT ?v0OffScreen
BSR OutputVertex ; Output v0
MOV v0, R0 ; [slot] point r0 at v0
MOV.L pcw, @pkm ; store PCW
PREF @pkm ; sq blast
ADD #32, pkm ; move along
DT counter ; counter can't possibly go to zero here, so no check
?v0OffScreen MOV @(_outcode, v1), temp ; read v1's outcode
pad
pad
pad
XOR temp, outcode ; outcode = v0 ^ v1
SHLR outcode ; get bottom bit into T
BF/S ?no0to1alpha ; near bits equal: no alpha vertex needed on this edge
MOV temp, outcode ; [slot] put v1's outcode into 'outcode'
; We need to output the 0-1 alpha vertex
MOV v0, R0 ; r0 points to the first vertex
BSR OutputAlpha
MOV v1, addrV1 ; [slot] addrv1 is the second vertex
DT counter ; can't go to zero here
MOV.L #H'e0000000, pcw
MOV.L pcw, @pkm ; store the e0000000
PREF @pkm
ADD #32, pkm ; blast and move on
?no0to1alpha MOV outcode, R0 ; get outcode of v1 into R0
SHLR R0 ; get clip bit into T
BT ?v1OffScreen ; is v1 off screen?
BSR OutputVertex
MOV v1, R0 ; [slot] output v1
DT counter ; is this the last vertex?
BF ?notTheLast
SHAR pcw ; e0000000 to f0000000
?notTheLast MOV.L pcw, @pkm ; store PCW
PREF @pkm ; sq blast
ADD #32, pkm ; move along
?v1OffScreen MOV @(_outcode, v2), temp ; read v2's outcode
pad
pad
pad
XOR temp, outcode ; get v1^v2
SHLR outcode ; T bit action
BF/S ?no1to2alpha ; no need to output an alpha vertex then
MOV temp,outcode ; [slot] get v2's outcode ready for the next bit
; we Need to output a v1-v2 alpha
MOV v1, R0
BSR OutputAlpha
MOV v2, addrV1 ; [s] output alpha value
MOV.L #H'E0000000, pcw
DT counter ; check to see if this is the last
BF ?notTheLast2
SHAR pcw ; e0000000 -> f0000000
?notTheLast2 MOV.L pcw, @pkm
PREF @pkm
ADD #32, pkm
?no1to2alpha MOV outcode, R0 ; get v2's outcode into R0
SHLR R0 ; T= offscreenness of v2
BT ?v2OffScreen
BSR OutputVertex
MOV v2, R0 ; [slot] output v2
DT counter ; is this the last vertex?
BF ?notTheLast3
SHAR pcw ; e0000000 to f0000000
?notTheLast3 MOV.L pcw, @pkm ; store PCW
PREF @pkm ; sq blast
ADD #32, pkm ; move along
?v2OffScreen MOV @(_outcode, v0), temp ; read v0's outcode
pad
pad
pad
XOR temp, outcode
pad
SHLR outcode ; check v0^v2
BF ?noFlushjumper ; no more to do? Loop all the way back!
; we Need to output a v2-v0 alpha
MOV v2, R0
BSR OutputAlpha
MOV v0, addrV1 ; [s] output alpha value
MOV.L #H'f0000000, pcw ; definitely the last
MOV.L pcw, @pkm
PREF @pkm
BRA ?noFlush
ADD #32, pkm ; [slot]
?noFlushjumper
BRA ?noFlush
NOP
; //////////////////////////////////////////////////////////////////////////////
; The clipped result is a quad (exactly one vertex is off the near plane).
; The four vertices are first built in polygon order in ?QuadBuffer, by
; temporarily pointing pkm at it, using the same edge walk as ?clipToTri.
; They are then copied to the real pkm in the order 0, 1, 3, 2 so that they
; form a strip, with e0000000 PCWs on the first three and f0000000 on the last.
; ?QuadBuffer is a single static buffer shared by every call.
; The copy toggles FSCHG so that each FMOV.D moves a register pair;
; vertXY and vertZW are presumably the pair names provided by DEC_XYZW.
; The MOV @(_outcode, ...) loads below carry no size suffix; presumably they
; assemble as longword loads like the MOV.L forms used elsewhere - confirm.
.ALIGN 16
?clipToQuad
MOVA ?QuadBuffer, R0
PUSH pkm ; keep the real output pointer safe
MOV R0, pkm ; point pkm at a buffer for now
MOV @(_outcode, v0), outcode; get v0's outcode
MOV outcode, R0
SHLR R0 ; get T as the nearclip bit
BT ?v0OffScreenQ
BSR OutputVertex ; Output v0
MOV v0, R0 ; [slot] point r0 at v0
ADD #32, pkm ; next buffer slot (no PCW or dispatch while buffering)
?v0OffScreenQ MOV @(_outcode, v1), temp ; read v1's outcode
pad
pad
pad
XOR temp, outcode ; outcode = v0 ^ v1
SHLR outcode ; get bottom bit into T
BF/S ?no0to1alphaQ ; near bits equal: no alpha vertex needed on this edge
MOV temp, outcode ; [slot] put v1's outcode into 'outcode'
; We need to output the 0-1 alpha vertex
MOV v0, R0 ; r0 points to the first vertex
BSR OutputAlpha
MOV v1, addrV1 ; [slot] addrv1 is the second vertex
ADD #32, pkm
?no0to1alphaQ MOV outcode, R0 ; get outcode of v1 into R0
SHLR R0 ; get clip bit into T
BT ?v1OffScreenQ ; is v1 off screen?
BSR OutputVertex
MOV v1, R0 ; [slot] output v1
ADD #32, pkm
?v1OffScreenQ MOV @(_outcode, v2), temp ; read v2's outcode
pad
pad
pad
XOR temp, outcode ; get v1^v2
SHLR outcode ; T bit action
BF/S ?no1to2alphaQ ; no need to output an alpha vertex then
MOV temp,outcode ; [slot] get v2's outcode ready for the next bit
; we Need to output a v1-v2 alpha
MOV v1, R0
BSR OutputAlpha
MOV v2, addrV1 ; [s] output alpha value
ADD #32, pkm
?no1to2alphaQ MOV outcode, R0 ; get v2's outcode into R0
SHLR R0 ; T= offscreenness of v2
BT ?v2OffScreenQ
BSR OutputVertex
MOV v2, R0 ; [slot] output v2
ADD #32, pkm
?v2OffScreenQ MOV @(_outcode, v0), temp ; read v0's outcode
pad
pad
pad
XOR temp, outcode
pad
SHLR outcode ; check v0^v2
BF ?endQuadClip ; no more to do? Go and copy the buffer out
; we Need to output a v2-v0 alpha
MOV v2, R0
BSR OutputAlpha
MOV v0, addrV1 ; [s] output alpha value
;no need ADD #32, pkm
; Now to output the vertices in 0132 order
?endQuadClip POP pkm ; back to the real output pointer
MOVA ?QuadBuffer, R0 ; get true address in r0
MOV.L #H'E0000000, pcw ; pcw is 0xe0000000
FSCHG ; change to nice big stores
; first vertex : (buffer slot 0)
FMOV.D @R0+, vertXY
FMOV.D @R0+, vertZW
FMOV.D vertXY, @pkm
MOV.L pcw, @pkm ; overwrite the first longword with the PCW
ADD #8, pkm
FMOV.D vertZW, @pkm
ADD #8, pkm
FMOV.D @R0+, vertXY
FMOV.D @R0+, vertZW
FMOV.D vertXY, @pkm
ADD #8, pkm
FMOV.D vertZW, @pkm
PREF @pkm ; dispatch (pkm is still inside this vertex's 32 bytes)
ADD #8, pkm
; second vertex (buffer slot 1)
FMOV.D @R0+, vertXY
FMOV.D @R0+, vertZW
FMOV.D vertXY, @pkm
MOV.L pcw, @pkm ; overwrite the first longword with the PCW
ADD #8, pkm
FMOV.D vertZW, @pkm
ADD #8, pkm
FMOV.D @R0+, vertXY
FMOV.D @R0+, vertZW
FMOV.D vertXY, @pkm
ADD #8, pkm
FMOV.D vertZW, @pkm
PREF @pkm
ADD #8, pkm
ADD #8*4, R0 ; skip to fourth vertex (buffer slot 3)
; third vertex to be sent (buffer slot 3)
FMOV.D @R0+, vertXY
FMOV.D @R0+, vertZW
FMOV.D vertXY, @pkm
MOV.L pcw, @pkm ; overwrite the first longword with the PCW
ADD #8, pkm
FMOV.D vertZW, @pkm
ADD #8, pkm
FMOV.D @R0+, vertXY
FMOV.D @R0+, vertZW
FMOV.D vertXY, @pkm
ADD #8, pkm
FMOV.D vertZW, @pkm
PREF @pkm
ADD #8, pkm
ADD #-8*4*2, R0 ; rewind back to third vertex (buffer slot 2)
SHAR pcw ; e0000000 -> f0000000
; last vertex (buffer slot 2)
FMOV.D @R0+, vertXY
FMOV.D @R0+, vertZW
FMOV.D vertXY, @pkm
MOV.L pcw, @pkm ; overwrite the first longword with the terminating PCW
ADD #8, pkm
FMOV.D vertZW, @pkm
ADD #8, pkm
FMOV.D @R0+, vertXY
FMOV.D @R0+, vertZW
FMOV.D vertXY, @pkm
ADD #8, pkm
FMOV.D vertZW, @pkm
PREF @pkm
FSCHG ; back to single-register FMOV
BRA ?noFlush ; and loop back
ADD #8, pkm ; [slot]
; Scratch space for ?clipToQuad: 8*4 longwords = 128 bytes, i.e. four
; 32-byte output vertices.  It lives in the same section as the code and is
; addressed with MOVA, so it has to stay longword aligned and close to its users.
.ALIGN 16
?QuadBuffer .RES.L 8*4
; Only assembled when COUNT_GEOMETRY is 1: a_nDrawn holds the address of the
; imported counter _nDrawn, which the main loop updates through it.
.AIF \&COUNT_GEOMETRY EQ 1
.IMPORT _nDrawn
a_nDrawn: .DATA.L _nDrawn
.AENDI
.END
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment