Skip to content

Instantly share code, notes, and snippets.

@brion
Last active Dec 31, 2022
Embed
What would you like to do?
imul16 for 6502
; Lightly tested work in progress; imul16 for 6502
; two 16-bit inputs, one 32-bit output
; using the Atari floating point registers as argument placeholders
; ca65 syntax
; brion vibber, 2022
; FP registers in zero page
; The file header describes these as the Atari floating point registers,
; borrowed here purely as scratch/argument storage. The addresses below are
; spaced 6 bytes apart, so each slot has room for the 16- and 32-bit values
; this file keeps in them.
FR0 = $d4 ; used by imul16 as arg1 (16-bit, clobbered)
FRE = $da ; not referenced by the code visible in this file
FR1 = $e0 ; used by imul16 as arg2 (16-bit, clobbered)
FR2 = $e6 ; used by imul16 as the 32-bit result
FRX = $ec ; not referenced by the code visible in this file
.code ; everything below is assembled into the code segment
.export start ; make the `start` proc visible to the linker
; neg bytes, arg
; Negate, in place, the little-endian two's-complement integer that occupies
; `bytes` consecutive bytes starting at `arg` (arg = 0 - arg).
; Clobbers: A and the processor flags.
; Timing: 2 + 8 * bytes cycles (per-instruction counts below assume `arg`
; is in zero page).
.macro neg bytes, arg
    sec                         ; 2 cyc - start the subtraction with no borrow
    .repeat bytes, offset       ; 8 cyc per byte, lowest byte first
        lda #0                  ; 2 cyc
        sbc arg + offset        ; 3 cyc - the borrow ripples upward through C
        sta arg + offset        ; 3 cyc
    .endrepeat
.endmacro
; neg16 arg
; Negate the 2-byte (16-bit) value at `arg` in place by expanding `neg 2, arg`.
; Clobbers whatever `neg` clobbers (A and flags).
; 18 cycles (2 + 8 * 2, per the formula on the neg macro)
.macro neg16 arg
neg 2, arg
.endmacro
; neg32 arg
; Negate the 4-byte (32-bit) value at `arg` in place by expanding `neg 4, arg`.
; Clobbers whatever `neg` clobbers (A and flags).
; 34 cycles (2 + 8 * 4, per the formula on the neg macro)
.macro neg32 arg
neg 4, arg
.endmacro
; bitmul16 arg1, arg2, result, bitnum
; One step of the unrolled shift-and-add multiply (inner loop for imul16).
;
; If bit `bitnum` of the 16-bit value at arg1 is set, the 16-bit value at
; arg2 is added into the top half of the 32-bit accumulator (result+2,
; result+3). The accumulator is then shifted right by one bit, with the
; carry out of that add (or 0 when nothing was added) entering bit 7 of
; result+3. Every byte is rotated exactly once per step on both paths.
;
; Expand once for each bitnum 0..15, in ascending order, with result+2 and
; result+3 zeroed beforehand. result+0/result+1 need no initialisation:
; their old contents are shifted out completely over the 16 steps.
;
; arg1 and arg2 must be 0 or positive (the caller strips the signs first).
; Clobbers: A and flags.
; Timing, assuming zero-page operands and no page-crossing branch:
;   bitnum 0..7 : 25 cycles (bit clear) or 42 cycles (bit set)
;   bitnum 8..15: 30 cycles (bit clear) or 47 cycles (bit set)
.macro bitmul16 arg1, arg2, result, bitnum
    .local next

    ; C = 0 serves both as the carry-in of the add and, when the add is
    ; skipped, as the bit shifted into the top of the accumulator.
    clc                             ; 2 cyc

    ; check if arg1 has 0 or 1 bit in this place (lda/and leave C alone)
    ; 5 cycles either way
    .if bitnum < 8
        lda arg1                    ; 3 cyc
        and #(1 << bitnum)          ; 2 cyc
    .else
        lda arg1 + 1                ; 3 cyc
        and #(1 << (bitnum - 8))    ; 2 cyc
    .endif
    beq next                        ; 2 cyc (3 when taken)

    ; 16-bit add on the top bits; leaves the 17th bit of the sum in C
    lda result + 2                  ; 3 cyc
    adc arg2                        ; 3 cyc
    sta result + 2                  ; 3 cyc
    lda result + 3                  ; 3 cyc
    adc arg2 + 1                    ; 3 cyc
    sta result + 3                  ; 3 cyc

next:
    ; Shift the 32-bit result down by one bit, pulling the carry (from the
    ; add, or the cleared carry when the add was skipped) into the top.
    ror result + 3                  ; 5 cyc
    ror result + 2                  ; 5 cyc
    ror result + 1                  ; 5 cyc
    .if bitnum >= 8
        ; we can save 5 cycles * 8 bits = 40 cycles total by skipping this
        ; byte while it still holds nothing but uninitialized data
        ror result                  ; 5 cyc
    .endif
.endmacro
; check_sign arg
; Inspect the sign bit (bit 7 of arg+1) of the 16-bit little-endian value at
; `arg`. When it is set, flip the value to positive in place and bump X, so
; that X keeps a running count of negative inputs. X is not initialised
; here; the caller sets it up.
; Clobbers: A and flags (and X when the value was negative).
; Timing with `arg` in zero page: 6 cycles when the branch is taken
; (non-negative input), 25 cycles when the value has to be negated.
.macro check_sign arg
    .local nonneg
    lda arg + 1                 ; 3 cyc - N flag now mirrors the sign bit
    bpl nonneg                  ; 2 cyc (3 when taken)
    neg 2, arg                  ; 18 cyc - 16-bit negate, same expansion as neg16
    inx                         ; 2 cyc
nonneg:
.endmacro
; imul16: signed 16-bit x 16-bit multiply with a signed 32-bit product.
;   In:       FR0 = arg1, FR1 = arg2 (16-bit two's complement, low byte first)
;   Out:      FR2 .. FR2+3 = 32-bit product (low byte first)
;   Clobbers: A, X, flags; arg1 and arg2 are replaced by their magnitudes
;   Timing:   the original author's estimate is min 454 / max 756 cycles
;             (not re-derived here)
.proc imul16
    arg1 = FR0                  ; 16-bit arg (clobbered)
    arg2 = FR1                  ; 16-bit arg (clobbered)
    result = FR2                ; 32-bit result

    ; X starts at 0: it clears the top 16 bits of the accumulator and then
    ; serves as the count of negative inputs.
    ldx #0                      ; 2 cyc
    stx result + 2              ; 3 cyc
    stx result + 3              ; 3 cyc
    ; result+0 / result+1 are left alone; the shifts below flush them out

    ; make both inputs non-negative, counting the sign flips in X
    check_sign arg1
    check_sign arg2

    ; fully unrolled 16-step shift-and-add: speed bought with code size
    .repeat 16, bitnum
        bitmul16 arg1, arg2, result, bitnum
    .endrepeat

    ; Exactly one negative input means the product must be negative.
    cpx #1                      ; 2 cyc
    bne done                    ; 2 cyc (3 when taken)
    neg32 result                ; 34 cyc
done:
    rts                         ; 6 cyc
.endproc
; iter: unfinished placeholder for a fixed-point iteration loop.
; The body below is pseudo-code only: the proc holds a label and comments
; but no instructions, so nothing is assembled for it yet.
.proc iter
; (cx and cy should be pre-scaled to 6.26 fixed point)
; Initial state:
; zx = 0
; zy = 0
; zx_2 = 0
; zy_2 = 0
; zx_zy = 0
; still working on the fixed-point
loop:
; iters++
; 6.26:
; zx = zx_2 + zy_2 + cx
; NOTE(review): the usual z = z^2 + c step is zx = zx_2 - zy_2 + cx;
; confirm whether the '+' above is intended or a typo.
; zy = zx_zy + zx_zy + cy
; round to 6.10.
; 12.20:
; zx_2 = zx * zx
; zy_2 = zy * zy
; NOTE(review): zx_zy is read above but never assigned in this sketch;
; presumably zx_zy = zx * zy belongs here - confirm.
; dist = zx_2 + zy_2
; if dist >= 4 break, else continue iterating
; round zx_2, zy_2, dist to 6.26
; if may be in the lake, look for looping output with a small buffer
; as an optimization vs running to max iters
.endproc
; start: exported entry point, a manual smoke test for imul16.
; Stores 5 in FR0 and -3 in FR1, calls imul16, and repeats forever. Nothing
; checks the product in code; after each call FR2 .. FR2+3 should hold the
; 32-bit value -15 ($fffffff1).
.proc start
    LHS = 5                     ; first operand
    RHS = $fffd                 ; second operand: -3 as 16-bit two's complement

again:
    ; FR0 = LHS, low byte first
    lda #<LHS
    sta FR0
    lda #>LHS
    sta FR0 + 1
    ; FR1 = RHS, low byte first
    lda #<RHS
    sta FR1
    lda #>RHS
    sta FR1 + 1
    jsr imul16
    ; should have 32-bit -15 in FR2
    jmp again
.endproc
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment