ped7g/erastothenes2.asm

## erastothenes2.asm
; Author: Ped7g ; (C) 2022 ; license: MIT / GPL / Public Domain (pick whichever fits best for you)
; assembled with: https://github.com/z00m128/sjasmplus/
; based on https://github.com/MartinezTorres/z80_babel project, written as comparison
;
; This is second version of the sieve routine (asm-like asm, focusing on performance, hard-wired buffer, etc...)
; For first C-like version check: https://gist.github.com/ped7g/c55bfa0d55ca13ce029549636cdd1de5
;
; sieve routine sieve_a_2 starts around line 110, below test harness

; comment/uncomment the case you wan to debug:

primes_size EQU     3100

work_size   EQU     12998 /* or 12999*/ ; enough for 2860 primes, ending with 25999 (same as 26000 buffer for sieve_c_1)
do_case     EQU     caseCrc     ; verification of actual primes list (2860 primes CRCed)

; work_size   EQU     1024        ; like 2048 for sieve_c_1 ; expected result HL=0x0135 (309), primes[0] = ..., 0x7F7 (2039)
; do_case     EQU     caseZ80babel    ; takes approx 46ms to finish

; work_size   EQU     430         ; hand tuned to fit single frame in CSpect emulator
; do_case     EQU     caseTime
; ; sieve_a_1 fits 1 frame in CSpect at work_size 500, primes: 0x005F (95), last prime 0x01F3 (499)
; ; sieve_a_2 fits 1 frame in CSpect at work_size 430, primes: 0x0096 (150), last prime 0x035F (863)

    DEFINE ORG_ADR $8080    ; using only uncontended faster memory (can't fit the maximum size buffer for case below)

; primes_size EQU     4               ; 8 bytes ahead of "work" buffer, using overlap of the two
; work_size   EQU     32766           ; check all 16b numbers up to 65535
; do_case     EQU     caseZ80babel    ; 6542 primes, last prime 65521, takes about 1.8sec to generate
;
;     DEFINE ORG_ADR $7E00    ; for work_size 32766, to have enough room for work buffer

;----------------------------------------------------------------------------------------------------------------------
; test harness to debug the code in emulator

    OPT --syntax=abf
    DEVICE ZXSPECTRUM48,ORG_ADR-1
    ORG     ORG_ADR
test_start:
    jp      do_case

caseZ80babel:
    ld      a,6
    out     (254),a
    call    sieve_a_2
    nop     ; expected result HL=0x0135 (309), primes[0] = ..., 0x7F7 (2039)
    ld      a,4
    out     (254),a
    jr      $

caseCrc:
    ld      a,5
    out     (254),a
    call    sieve_a_2
    nop     ; expected result HL=0x0B2C (2860), primes[0] = ..., 0x658F (25999)

    ; check result if it's as expected
    ld      a,6
    out     (254),a
    push    hl
    add     hl,hl
    ld      bc,hl           ; fake
    ld      de,0            ; crc add:xor
    ld      hl,primes
.crc
    ld      a,e
    xor     (hl)
    ld      e,a
    xor     d
    add     a,(hl)
    ld      d,a
    inc     hl
    dec     bc
    ld      a,b
    or      c
    jr      nz,.crc
    ld      a,4
    ld      hl,0x0DEB
    or      a
    sbc     hl,de
    jr      z,.ok0
    ld      a,2
.ok0:
    pop     hl
    ld      de,0x0B2C
    or      a
    sbc     hl,de
    jr      z,.ok1
    ld      a,2
.ok1:
    ld      hl,25999
    ld      de,(ix-2)       ; fake
    or      a
    sbc     hl,de
    jr      z,.ok2
    ld      a,2
.ok2:
    out     (254),a
    jr      $

caseTime:
    ei
    halt
    ld      a,5
    out     (254),a
    call    sieve_a_2
    xor     a
    out     (254),a
    jr      caseTime

;----------------------------------------------------------------------------------------------------------------------
; 138B hand written asm sieve optimised for performance, taking many shortcuts, using odd-number-only sieve
; "work" can overlap "primes" starting from primes+8 address (to maximise amount of primes calculated and returned)
sieve_a_2:

    ASSERT high work && 2 <= work_size
    ASSERT (primes + 8 <= work) || (work + work_size <= primes)
    ASSERT work + work_size <= 0xFFFF   ; if work buffer would have last byte at 0xFFFF, the calculation of "i" breaks

    ; logical : memory address offset into work buffer, storing only odd numbers sieve
    ; work[3] : -1 (not stored in memory at all)
    ; work[5] : +0
    ; work[7] : +1
    ; work[9] : +2

    ; *p++ = 2;
    ld      ix,primes
    xor     a
    ld      (ix+0),2
    ld      (ix+1),a
    inc     ix
    inc     ix

    ; clear the work buffer (full clear, first byte of buffer is marker for prime 5)
    ; for (uint16_t i = 0; i < work_size; i++) work[i] = 0x00;
    ld      hl,work
    ld      de,work+1
    ld      bc,work_size-1
    ld      (hl),a
    ldir

    ld      hl,work
    ld      bc,work_size

    ; mark prime 3 without searching by starting .mark_and_loop at work[5]

.mark_and_loop:
    ; work[i] signals prime ; IX = p, HL = work[i+2] = work + (i-5)/2 + 1, BC = bytes to check (>0)
    ex      de,hl
    ; i = 2*(hl - work + 1) + 1
    ld      hl,1-work
    add     hl,de       ; always cf=1 when 0xFFFF is not part of work buffer
    adc     hl,hl

    ; *p++ = i;
    ld      (ix+0),l
    ld      (ix+1),h
    inc     ix
    inc     ix

    ; IX = p, HL = i, DE = work[i+2], BC = bytes to check (BTW DE+BC = work.end(), but unable to use it)

    ; mark work buffer at all multiplies of `i` (inner loop setting the sieve)
    ; for (uint16_t j = i + i + i; j < work_size; j += i+i) work[j] = 1;
    push    bc          ; preserve bytes to check
    push    de          ; preserve work[i+2]
    ld      bc,-(work+work_size)
    add     hl,bc       ; HL = i-(work+work_size)   ; constant to check buffer overrun and add i+i to "j"
    jr      c,.inner_setup_ov   ; when this happens, it happens for all following primes -> faster final loop
    ex      de,hl
    dec     hl          ; HL = work[i], DE = i-(work+work_size), BC = -(work+work_size)
    add     hl,de       ; j = i + i + i (adjusting work[j] address), plus test for work_size, must not overflow
    jr      c,.inner_setup_ov   ; when this happens, it happens for all following primes -> faster final loop
.inner_loop:
    sbc     hl,bc       ; restore hl to hl+j
    ld      (hl),h      ; work[j] = non_zero_value
    add     hl,de       ; hl += i + test for work_size, must not overflow
    jp      nc,.inner_loop
    pop     hl          ; restore work[i+2] pointer = next to check
    pop     bc          ; restore bytes to check

    ; outer_loop ; for (uint16_t i = 3; i < work_size; i+=2)
    cpir
    jp      pe,.mark_and_loop   ; zf=1, pv=1
    ; pv=0 case is UNREACHABLE ; should never happen, because sooner the inner loop setup will overflow

    ; outer_loop phase 2, when marking primes is always impossible, (j >= work_size) == true because of large i
.inner_setup_ov:
    pop     hl          ; restore work[i+2] pointer = next to check
    pop     bc          ; restore bytes to check
    jp      .final_loop_entry

.final_loop:
    ; work[i] signals prime ; IX = p, HL = work[i+2] = work + (i-5)/2 + 1, BC = bytes to check (>0)
    ex      de,hl
    ; i = 2*(hl - work + 1) + 1
    ld      hl,1-work
    add     hl,de       ; always cf=1 when 0xFFFF is not part of work buffer
    adc     hl,hl
    ; *p++ = i;
    ld      (ix+0),l
    ld      (ix+1),h
    inc     ix
    inc     ix
    ex      de,hl       ; restore HL to work[i+2] and continue with final loop
.final_loop_entry:
    cpir
    jp      pe,.final_loop  ; zf=1, pv=1
    jr      nz,.final_loop_done ; zf=0, pv=0 (last work[i] was not prime)

    ; zf=1, pv=0 (match at last byte) - store last prime
    ld      de,1-work
    add     hl,de       ; always cf=1 when 0xFFFF is not part of work buffer
    adc     hl,hl
    ; *p++ = i;
    ld      (ix+0),l
    ld      (ix+1),h
    inc     ix
    inc     ix

.final_loop_done:
    ; return p-primes;
    ld      a,ixl
    sub     low(primes)
    ld      l,a
    ld      a,ixh
    sbc     a,high(primes)
    rra
    ld      h,a
    rr      l           ; HL = number of primes
    ret

    DISPLAY "sieve_a_2 code size: ",/A,$-sieve_a_2

;----------------------------------------------------------------------------------------------------------------------
; "primes" and "work" buffers

primes:     ds      2*primes_size,$AA
work:       ds      work_size,$BB

    db      'X'         ; the work buffer can't use 0xFFFF address, so this is guardian byte making assembling fail

    SAVESNA "sieve2.sna", test_start     ;; DEBUG snapshot
	; Author: Ped7g ; (C) 2022 ; license: MIT / GPL / Public Domain (pick whichever fits best for you)
	; assembled with: https://github.com/z00m128/sjasmplus/
	; based on https://github.com/MartinezTorres/z80_babel project, written as comparison
	;
	; This is second version of the sieve routine (asm-like asm, focusing on performance, hard-wired buffer, etc...)
	; For first C-like version check: https://gist.github.com/ped7g/c55bfa0d55ca13ce029549636cdd1de5
	;
	; sieve routine sieve_a_2 starts around line 110, below test harness

	; comment/uncomment the case you wan to debug:

	primes_size EQU 3100

	work_size EQU 12998 /* or 12999*/ ; enough for 2860 primes, ending with 25999 (same as 26000 buffer for sieve_c_1)
	do_case EQU caseCrc ; verification of actual primes list (2860 primes CRCed)

	; work_size EQU 1024 ; like 2048 for sieve_c_1 ; expected result HL=0x0135 (309), primes[0] = ..., 0x7F7 (2039)
	; do_case EQU caseZ80babel ; takes approx 46ms to finish

	; work_size EQU 430 ; hand tuned to fit single frame in CSpect emulator
	; do_case EQU caseTime
	; ; sieve_a_1 fits 1 frame in CSpect at work_size 500, primes: 0x005F (95), last prime 0x01F3 (499)
	; ; sieve_a_2 fits 1 frame in CSpect at work_size 430, primes: 0x0096 (150), last prime 0x035F (863)

	DEFINE ORG_ADR $8080 ; using only uncontended faster memory (can't fit the maximum size buffer for case below)

	; primes_size EQU 4 ; 8 bytes ahead of "work" buffer, using overlap of the two
	; work_size EQU 32766 ; check all 16b numbers up to 65535
	; do_case EQU caseZ80babel ; 6542 primes, last prime 65521, takes about 1.8sec to generate
	;
	; DEFINE ORG_ADR $7E00 ; for work_size 32766, to have enough room for work buffer

	;----------------------------------------------------------------------------------------------------------------------
	; test harness to debug the code in emulator

	OPT --syntax=abf
	DEVICE ZXSPECTRUM48,ORG_ADR-1
	ORG ORG_ADR
	test_start:
	jp do_case

	caseZ80babel:
	ld a,6
	out (254),a
	call sieve_a_2
	nop ; expected result HL=0x0135 (309), primes[0] = ..., 0x7F7 (2039)
	ld a,4
	out (254),a
	jr $

	caseCrc:
	ld a,5
	out (254),a
	call sieve_a_2
	nop ; expected result HL=0x0B2C (2860), primes[0] = ..., 0x658F (25999)

	; check result if it's as expected
	ld a,6
	out (254),a
	push hl
	add hl,hl
	ld bc,hl ; fake
	ld de,0 ; crc add:xor
	ld hl,primes
	.crc
	ld a,e
	xor (hl)
	ld e,a
	xor d
	add a,(hl)
	ld d,a
	inc hl
	dec bc
	ld a,b
	or c
	jr nz,.crc
	ld a,4
	ld hl,0x0DEB
	or a
	sbc hl,de
	jr z,.ok0
	ld a,2
	.ok0:
	pop hl
	ld de,0x0B2C
	or a
	sbc hl,de
	jr z,.ok1
	ld a,2
	.ok1:
	ld hl,25999
	ld de,(ix-2) ; fake
	or a
	sbc hl,de
	jr z,.ok2
	ld a,2
	.ok2:
	out (254),a
	jr $

	caseTime:
	ei
	halt
	ld a,5
	out (254),a
	call sieve_a_2
	xor a
	out (254),a
	jr caseTime

	;----------------------------------------------------------------------------------------------------------------------
	; 138B hand written asm sieve optimised for performance, taking many shortcuts, using odd-number-only sieve
	; "work" can overlap "primes" starting from primes+8 address (to maximise amount of primes calculated and returned)
	sieve_a_2:

	ASSERT high work && 2 <= work_size
	ASSERT (primes + 8 <= work) \|\| (work + work_size <= primes)
	ASSERT work + work_size <= 0xFFFF ; if work buffer would have last byte at 0xFFFF, the calculation of "i" breaks

	; logical : memory address offset into work buffer, storing only odd numbers sieve
	; work[3] : -1 (not stored in memory at all)
	; work[5] : +0
	; work[7] : +1
	; work[9] : +2

	; *p++ = 2;
	ld ix,primes
	xor a
	ld (ix+0),2
	ld (ix+1),a
	inc ix
	inc ix

	; clear the work buffer (full clear, first byte of buffer is marker for prime 5)
	; for (uint16_t i = 0; i < work_size; i++) work[i] = 0x00;
	ld hl,work
	ld de,work+1
	ld bc,work_size-1
	ld (hl),a
	ldir

	ld hl,work
	ld bc,work_size

	; mark prime 3 without searching by starting .mark_and_loop at work[5]

	.mark_and_loop:
	; work[i] signals prime ; IX = p, HL = work[i+2] = work + (i-5)/2 + 1, BC = bytes to check (>0)
	ex de,hl
	; i = 2*(hl - work + 1) + 1
	ld hl,1-work
	add hl,de ; always cf=1 when 0xFFFF is not part of work buffer
	adc hl,hl

	; *p++ = i;
	ld (ix+0),l
	ld (ix+1),h
	inc ix
	inc ix

	; IX = p, HL = i, DE = work[i+2], BC = bytes to check (BTW DE+BC = work.end(), but unable to use it)

	; mark work buffer at all multiplies of `i` (inner loop setting the sieve)
	; for (uint16_t j = i + i + i; j < work_size; j += i+i) work[j] = 1;
	push bc ; preserve bytes to check
	push de ; preserve work[i+2]
	ld bc,-(work+work_size)
	add hl,bc ; HL = i-(work+work_size) ; constant to check buffer overrun and add i+i to "j"
	jr c,.inner_setup_ov ; when this happens, it happens for all following primes -> faster final loop
	ex de,hl
	dec hl ; HL = work[i], DE = i-(work+work_size), BC = -(work+work_size)
	add hl,de ; j = i + i + i (adjusting work[j] address), plus test for work_size, must not overflow
	jr c,.inner_setup_ov ; when this happens, it happens for all following primes -> faster final loop
	.inner_loop:
	sbc hl,bc ; restore hl to hl+j
	ld (hl),h ; work[j] = non_zero_value
	add hl,de ; hl += i + test for work_size, must not overflow
	jp nc,.inner_loop
	pop hl ; restore work[i+2] pointer = next to check
	pop bc ; restore bytes to check

	; outer_loop ; for (uint16_t i = 3; i < work_size; i+=2)
	cpir
	jp pe,.mark_and_loop ; zf=1, pv=1
	; pv=0 case is UNREACHABLE ; should never happen, because sooner the inner loop setup will overflow

	; outer_loop phase 2, when marking primes is always impossible, (j >= work_size) == true because of large i
	.inner_setup_ov:
	pop hl ; restore work[i+2] pointer = next to check
	pop bc ; restore bytes to check
	jp .final_loop_entry

	.final_loop:
	; work[i] signals prime ; IX = p, HL = work[i+2] = work + (i-5)/2 + 1, BC = bytes to check (>0)
	ex de,hl
	; i = 2*(hl - work + 1) + 1
	ld hl,1-work
	add hl,de ; always cf=1 when 0xFFFF is not part of work buffer
	adc hl,hl
	; *p++ = i;
	ld (ix+0),l
	ld (ix+1),h
	inc ix
	inc ix
	ex de,hl ; restore HL to work[i+2] and continue with final loop
	.final_loop_entry:
	cpir
	jp pe,.final_loop ; zf=1, pv=1
	jr nz,.final_loop_done ; zf=0, pv=0 (last work[i] was not prime)

	; zf=1, pv=0 (match at last byte) - store last prime
	ld de,1-work
	add hl,de ; always cf=1 when 0xFFFF is not part of work buffer
	adc hl,hl
	; *p++ = i;
	ld (ix+0),l
	ld (ix+1),h
	inc ix
	inc ix

	.final_loop_done:
	; return p-primes;
	ld a,ixl
	sub low(primes)
	ld l,a
	ld a,ixh
	sbc a,high(primes)
	rra
	ld h,a
	rr l ; HL = number of primes
	ret

	DISPLAY "sieve_a_2 code size: ",/A,$-sieve_a_2

	;----------------------------------------------------------------------------------------------------------------------
	; "primes" and "work" buffers

	primes: ds 2*primes_size,$AA
	work: ds work_size,$BB

	db 'X' ; the work buffer can't use 0xFFFF address, so this is guardian byte making assembling fail

	SAVESNA "sieve2.sna", test_start ;; DEBUG snapshot