; bvibber/imul16.s

; imul16.s
; Lightly tested work in progress; imul16 for 6502
; two 16-bit inputs, one 32-bit output
; using the Atari floating point registers as argument placeholders
; ca65 syntax
; brion vibber, 2022

; FP registers in zero page
; The addresses below are spaced 6 bytes apart. imul16 only touches the
; first 2 bytes of FR0 and FR1 and the first 4 bytes of FR2.
FR0 = $d4 ; imul16 arg1 (16-bit, low byte first)
FRE = $da
FR1 = $e0 ; imul16 arg2 (16-bit, low byte first)
FR2 = $e6 ; imul16 result (32-bit, low byte first)
FRX = $ec

.code

.export start

; Negate a little-endian multi-byte integer in place (arg = 0 - arg).
;   bytes: how many bytes to negate
;   arg:   address of the least significant byte
; Clobbers A and the flags.
; 2 + 8 * bytes cycles when arg is in zero page
.macro neg bytes, arg
    sec                   ; 2 cyc - no borrow going into the lowest byte
    .repeat bytes, idx    ; 8 cyc per byte, lowest byte first
        lda #$00          ; 2 cyc
        sbc arg + idx     ; 3 cyc - A = 0 - byte - borrow from the byte below
        sta arg + idx     ; 3 cyc
    .endrepeat
.endmacro

; Negate the 16-bit value at arg in place (wrapper around neg).
; Clobbers A and the flags.
; 18 cycles
.macro neg16 arg
    neg 2, arg ; 2 bytes: arg and arg + 1
.endmacro

; Negate the 32-bit value at arg in place (wrapper around neg).
; Clobbers A and the flags.
; 34 cycles
.macro neg32 arg
    neg 4, arg ; 4 bytes: arg through arg + 3
.endmacro

; inner loop for imul16: one shift-and-add step for bit `bitnum` of arg1
;
; If the selected bit of arg1 is set, arg2 is added to the top 16 bits of
; result (result + 2, result + 3). Then result is shifted right one bit,
; with the carry out of the add (or 0 when nothing was added) entering at
; the top. Sixteen steps, bitnum = 0..15 in order, leave the full product
; in the 4 bytes at result.
;
;   arg1, arg2: addresses of 16-bit little-endian values, treated as
;               unsigned (imul16 passes magnitudes)
;   result:     address of the 32-bit little-endian accumulator; the
;               caller must zero result + 2 and result + 3 before step 0
;   bitnum:     constant 0..15 selecting the bit of arg1 to test
; Clobbers A and the flags.
;
; With zero-page operands and no page-crossing branch:
;   bitnum < 8:  25 cycles for a 0 bit, 42 for a 1 bit
;   bitnum >= 8: 30 cycles for a 0 bit, 47 for a 1 bit
.macro bitmul16 arg1, arg2, result, bitnum
    .local next

    ; Carry must be clear for the add, and for the shift when the add
    ; is skipped. The lda/and below leave it untouched.
    clc ; 2 cyc

    ; check if arg1 has 0 or 1 bit in this place
    ; 5 cycles either way
    .if bitnum < 8
        lda arg1                 ; 3 cyc
        and #(1 << bitnum)       ; 2 cyc
    .else
        lda arg1 + 1             ; 3 cyc
        and #(1 << (bitnum - 8)) ; 2 cyc
    .endif
    beq next ; 2 cyc, 3 if taken

    ; 16-bit add on the top bits; the carry out is bit 16 of the sum.
    ; The high byte is stored unshifted so both paths share the same
    ; rotate chain below.
    lda result + 2 ; 3 cyc
    adc arg2       ; 3 cyc
    sta result + 2 ; 3 cyc
    lda result + 3 ; 3 cyc
    adc arg2 + 1   ; 3 cyc
    sta result + 3 ; 3 cyc

next:
    ; Shift the 32-bit result down by one bit. The carry (add overflow,
    ; or 0 when the add was skipped) enters at bit 7 of the top byte and
    ; each byte's bit 0 is handed to the byte below through the carry.
    ror result + 3 ; 5 cyc
    ror result + 2 ; 5 cyc
    ror result + 1 ; 5 cyc
    .if bitnum >= 8
        ; During the first 8 steps this byte would only receive stale
        ; bits shifted out of result + 1, so rotating it is skipped
        ; (saves 5 cycles * 8 bits = 40 cycles). The last 8 steps shift
        ; it 8 times, which fills it completely with product bits.
        ror result ; 5 cyc
    .endif
.endmacro

; Make the 16-bit two's-complement value at arg non-negative in place.
; When the sign bit (bit 7 of arg + 1) is set, arg is negated and X is
; incremented, so X accumulates a count of negative inputs. Otherwise
; arg and X are left alone. Clobbers A and the flags.
; 6 cycles if already non-negative, 25 if negated (zero-page arg)
.macro check_sign arg
    .local nonnegative
    lda arg + 1      ; 3 cyc - loading the high byte sets N from the sign bit
    bpl nonnegative  ; 2 cyc, 3 if taken
    neg 2, arg       ; 18 cyc - flip the argument to positive
    inx              ; 2 cyc - one more negative input seen
nonnegative:
.endmacro

; Signed 16-bit x 16-bit multiply with a 32-bit result.
;   in:  FR0 (arg1), FR1 (arg2) - 16-bit two's complement, low byte first.
;        A negative input is negated in place, so both are clobbered.
;   out: FR2 (result) - 32-bit two's complement, low byte first
;   clobbers: A, X, flags
; Nearly all of the run time is spent in the 16 bitmul16 expansions.
.proc imul16
    arg1 = FR0   ; 16-bit arg (clobbered)
    arg2 = FR1   ; 16-bit arg (clobbered)
    result = FR2 ; 32-bit result

    ; X starts at zero: it is the zero source for the clears below and
    ; then the counter of negative inputs.
    ldx #0          ; 2 cyc

    ; Zero the top 16 bits of the result, where the partial sums are
    ; accumulated. The bottom two bytes need no clearing because the
    ; shifts overwrite them.
    stx result + 2  ; 3 cyc
    stx result + 3  ; 3 cyc

    ; Reduce both inputs to their magnitudes, counting negatives in X.
    check_sign arg1
    check_sign arg2

    ; Fully unrolled shift-and-add, one step per bit of arg1, trading
    ; code size for speed.
    .repeat 16, step
        bitmul16 arg1, arg2, result, step
    .endrepeat

    ; Exactly one negative input means the product is negative.
    cpx #1          ; 2 cyc
    bne done        ; 2 cyc, 3 if taken
    neg32 result    ; 34 cyc
done:

    rts ; 6 cyc
.endproc

; Placeholder for an iteration routine (presumably the Mandelbrot inner
; loop - confirm). The body currently holds only pseudo-code comments and
; a label: it emits no instructions and has no rts.
.proc iter
    ; (cx and cy should be pre-scaled to 6.26 fixed point)
    ; zx = 0
    ; zy = 0
    ; zx_2 = 0
    ; zy_2 = 0
    ; zx_zy = 0

    ; still working on the fixed-point
loop:
    ; iters++

    ; 6.26:
    ; zx = zx_2 + zy_2 + cx
    ; NOTE(review): the usual Mandelbrot step is zx_2 - zy_2 + cx - confirm
    ; zy = zx_zy + zx_zy + cy
    ; round to 6.10.

    ; 12.20:
    ; zx_2 = zx * zx
    ; zy_2 = zy * zy
    ; NOTE(review): zx_zy is never recomputed in this outline - confirm
    ; dist = zx_2 + zy_2
    ; if dist >= 4 break, else continue iterating

    ; round zx_2, zy_2, dist to 6.26

    ; if may be in the lake, look for looping output with a small buffer
    ; as an optimization vs running to max iters

.endproc

; Exported entry point: a test harness that multiplies 5 by -3 with
; imul16 over and over. It never returns.
.proc start
    lhs = 5      ; first operand
    rhs = $fffd  ; second operand: -3 as 16-bit two's complement

again:
    ; FR0 = 5
    lda #<lhs
    sta FR0
    lda #>lhs
    sta FR0 + 1

    ; FR1 = -3
    lda #<rhs
    sta FR1
    lda #>rhs
    sta FR1 + 1

    jsr imul16
    ; FR2 should now hold -15 as a 32-bit value

    jmp again
.endproc
	; Lightly tested work in progress; imul16 for 6502
	; two 16-bit inputs, one 32-bit output
	; using the Atari floating point registers as argument placeholders
	; ca65 syntax
	; brion vibber, 2022

	; FP registers in zero page
	; The addresses below are spaced 6 bytes apart. imul16 only touches the
	; first 2 bytes of FR0 and FR1 and the first 4 bytes of FR2.
	FR0 = $d4 ; imul16 arg1 (16-bit, low byte first)
	FRE = $da
	FR1 = $e0 ; imul16 arg2 (16-bit, low byte first)
	FR2 = $e6 ; imul16 result (32-bit, low byte first)
	FRX = $ec

	.code

	.export start

	; Negate a little-endian multi-byte integer in place (arg = 0 - arg).
	;   bytes: how many bytes to negate
	;   arg:   address of the least significant byte
	; Clobbers A and the flags.
	; 2 + 8 * bytes cycles when arg is in zero page
	.macro neg bytes, arg
		sec                   ; 2 cyc - no borrow going into the lowest byte
		.repeat bytes, idx    ; 8 cyc per byte, lowest byte first
			lda #$00          ; 2 cyc
			sbc arg + idx     ; 3 cyc - A = 0 - byte - borrow from the byte below
			sta arg + idx     ; 3 cyc
		.endrepeat
	.endmacro

	; Negate the 16-bit value at arg in place (wrapper around neg).
	; Clobbers A and the flags.
	; 18 cycles
	.macro neg16 arg
	neg 2, arg ; 2 bytes: arg and arg + 1
	.endmacro

	; Negate the 32-bit value at arg in place (wrapper around neg).
	; Clobbers A and the flags.
	; 34 cycles
	.macro neg32 arg
	neg 4, arg ; 4 bytes: arg through arg + 3
	.endmacro

	; inner loop for imul16: one shift-and-add step for bit `bitnum` of arg1
	;
	; If the selected bit of arg1 is set, arg2 is added to the top 16 bits of
	; result (result + 2, result + 3). Then result is shifted right one bit,
	; with the carry out of the add (or 0 when nothing was added) entering at
	; the top. Sixteen steps, bitnum = 0..15 in order, leave the full product
	; in the 4 bytes at result.
	;
	;   arg1, arg2: addresses of 16-bit little-endian values, treated as
	;               unsigned (imul16 passes magnitudes)
	;   result:     address of the 32-bit little-endian accumulator; the
	;               caller must zero result + 2 and result + 3 before step 0
	;   bitnum:     constant 0..15 selecting the bit of arg1 to test
	; Clobbers A and the flags.
	;
	; With zero-page operands and no page-crossing branch:
	;   bitnum < 8:  25 cycles for a 0 bit, 42 for a 1 bit
	;   bitnum >= 8: 30 cycles for a 0 bit, 47 for a 1 bit
	.macro bitmul16 arg1, arg2, result, bitnum
		.local next

		; Carry must be clear for the add, and for the shift when the add
		; is skipped. The lda/and below leave it untouched.
		clc ; 2 cyc

		; check if arg1 has 0 or 1 bit in this place
		; 5 cycles either way
		.if bitnum < 8
			lda arg1                 ; 3 cyc
			and #(1 << bitnum)       ; 2 cyc
		.else
			lda arg1 + 1             ; 3 cyc
			and #(1 << (bitnum - 8)) ; 2 cyc
		.endif
		beq next ; 2 cyc, 3 if taken

		; 16-bit add on the top bits; the carry out is bit 16 of the sum.
		; The high byte is stored unshifted so both paths share the same
		; rotate chain below.
		lda result + 2 ; 3 cyc
		adc arg2       ; 3 cyc
		sta result + 2 ; 3 cyc
		lda result + 3 ; 3 cyc
		adc arg2 + 1   ; 3 cyc
		sta result + 3 ; 3 cyc

	next:
		; Shift the 32-bit result down by one bit. The carry (add overflow,
		; or 0 when the add was skipped) enters at bit 7 of the top byte and
		; each byte's bit 0 is handed to the byte below through the carry.
		ror result + 3 ; 5 cyc
		ror result + 2 ; 5 cyc
		ror result + 1 ; 5 cyc
		.if bitnum >= 8
			; During the first 8 steps this byte would only receive stale
			; bits shifted out of result + 1, so rotating it is skipped
			; (saves 5 cycles * 8 bits = 40 cycles). The last 8 steps shift
			; it 8 times, which fills it completely with product bits.
			ror result ; 5 cyc
		.endif
	.endmacro

	; Make the 16-bit two's-complement value at arg non-negative in place.
	; When the sign bit (bit 7 of arg + 1) is set, arg is negated and X is
	; incremented, so X accumulates a count of negative inputs. Otherwise
	; arg and X are left alone. Clobbers A and the flags.
	; 6 cycles if already non-negative, 25 if negated (zero-page arg)
	.macro check_sign arg
		.local nonnegative
		lda arg + 1      ; 3 cyc - loading the high byte sets N from the sign bit
		bpl nonnegative  ; 2 cyc, 3 if taken
		neg 2, arg       ; 18 cyc - flip the argument to positive
		inx              ; 2 cyc - one more negative input seen
	nonnegative:
	.endmacro

	; Signed 16-bit x 16-bit multiply with a 32-bit result.
	;   in:  FR0 (arg1), FR1 (arg2) - 16-bit two's complement, low byte first.
	;        A negative input is negated in place, so both are clobbered.
	;   out: FR2 (result) - 32-bit two's complement, low byte first
	;   clobbers: A, X, flags
	; Nearly all of the run time is spent in the 16 bitmul16 expansions.
	.proc imul16
		arg1 = FR0   ; 16-bit arg (clobbered)
		arg2 = FR1   ; 16-bit arg (clobbered)
		result = FR2 ; 32-bit result

		; X starts at zero: it is the zero source for the clears below and
		; then the counter of negative inputs.
		ldx #0          ; 2 cyc

		; Zero the top 16 bits of the result, where the partial sums are
		; accumulated. The bottom two bytes need no clearing because the
		; shifts overwrite them.
		stx result + 2  ; 3 cyc
		stx result + 3  ; 3 cyc

		; Reduce both inputs to their magnitudes, counting negatives in X.
		check_sign arg1
		check_sign arg2

		; Fully unrolled shift-and-add, one step per bit of arg1, trading
		; code size for speed.
		.repeat 16, step
			bitmul16 arg1, arg2, result, step
		.endrepeat

		; Exactly one negative input means the product is negative.
		cpx #1          ; 2 cyc
		bne done        ; 2 cyc, 3 if taken
		neg32 result    ; 34 cyc
	done:

		rts ; 6 cyc
	.endproc

	; Placeholder for an iteration routine (presumably the Mandelbrot inner
	; loop - confirm). The body currently holds only pseudo-code comments and
	; a label: it emits no instructions and has no rts.
	.proc iter
	; (cx and cy should be pre-scaled to 6.26 fixed point)
	; zx = 0
	; zy = 0
	; zx_2 = 0
	; zy_2 = 0
	; zx_zy = 0

	; still working on the fixed-point
	loop:
	; iters++

	; 6.26:
	; zx = zx_2 + zy_2 + cx
	; NOTE(review): the usual Mandelbrot step is zx_2 - zy_2 + cx - confirm
	; zy = zx_zy + zx_zy + cy
	; round to 6.10.

	; 12.20:
	; zx_2 = zx * zx
	; zy_2 = zy * zy
	; NOTE(review): zx_zy is never recomputed in this outline - confirm
	; dist = zx_2 + zy_2
	; if dist >= 4 break, else continue iterating

	; round zx_2, zy_2, dist to 6.26

	; if may be in the lake, look for looping output with a small buffer
	; as an optimization vs running to max iters

	.endproc

	; Exported entry point: a test harness that multiplies 5 by -3 with
	; imul16 over and over. It never returns.
	.proc start
		lhs = 5      ; first operand
		rhs = $fffd  ; second operand: -3 as 16-bit two's complement

	again:
		; FR0 = 5
		lda #<lhs
		sta FR0
		lda #>lhs
		sta FR0 + 1

		; FR1 = -3
		lda #<rhs
		sta FR1
		lda #>rhs
		sta FR1 + 1

		jsr imul16
		; FR2 should now hold -15 as a 32-bit value

		jmp again
	.endproc