Skip to content

Instantly share code, notes, and snippets.

@ped7g
Last active February 25, 2022 04:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ped7g/1602d2e165850b55f6a52749d9811544 to your computer and use it in GitHub Desktop.
Save ped7g/1602d2e165850b55f6a52749d9811544 to your computer and use it in GitHub Desktop.
Z80 asm Erastothenes sieve to calculate prime numbers - part two, hunting for performance
; Author: Ped7g ; (C) 2022 ; license: MIT / GPL / Public Domain (pick whichever fits best for you)
; assembled with: https://github.com/z00m128/sjasmplus/
; based on https://github.com/MartinezTorres/z80_babel project, written as comparison
;
; This is second version of the sieve routine (asm-like asm, focusing on performance, hard-wired buffer, etc...)
; For first C-like version check: https://gist.github.com/ped7g/c55bfa0d55ca13ce029549636cdd1de5
;
; sieve routine sieve_a_2 starts around line 110, below test harness
; comment/uncomment the case you wan to debug:
primes_size EQU 3100
work_size EQU 12998 /* or 12999*/ ; enough for 2860 primes, ending with 25999 (same as 26000 buffer for sieve_c_1)
do_case EQU caseCrc ; verification of actual primes list (2860 primes CRCed)
; work_size EQU 1024 ; like 2048 for sieve_c_1 ; expected result HL=0x0135 (309), primes[0] = ..., 0x7F7 (2039)
; do_case EQU caseZ80babel ; takes approx 46ms to finish
; work_size EQU 430 ; hand tuned to fit single frame in CSpect emulator
; do_case EQU caseTime
; ; sieve_a_1 fits 1 frame in CSpect at work_size 500, primes: 0x005F (95), last prime 0x01F3 (499)
; ; sieve_a_2 fits 1 frame in CSpect at work_size 430, primes: 0x0096 (150), last prime 0x035F (863)
DEFINE ORG_ADR $8080 ; using only uncontended faster memory (can't fit the maximum size buffer for case below)
; primes_size EQU 4 ; 8 bytes ahead of "work" buffer, using overlap of the two
; work_size EQU 32766 ; check all 16b numbers up to 65535
; do_case EQU caseZ80babel ; 6542 primes, last prime 65521, takes about 1.8sec to generate
;
; DEFINE ORG_ADR $7E00 ; for work_size 32766, to have enough room for work buffer
;----------------------------------------------------------------------------------------------------------------------
; test harness to debug the code in emulator
OPT --syntax=abf
DEVICE ZXSPECTRUM48,ORG_ADR-1
ORG ORG_ADR
test_start:
jp do_case
caseZ80babel:
ld a,6
out (254),a
call sieve_a_2
nop ; expected result HL=0x0135 (309), primes[0] = ..., 0x7F7 (2039)
ld a,4
out (254),a
jr $
caseCrc:
ld a,5
out (254),a
call sieve_a_2
nop ; expected result HL=0x0B2C (2860), primes[0] = ..., 0x658F (25999)
; check result if it's as expected
ld a,6
out (254),a
push hl
add hl,hl
ld bc,hl ; fake
ld de,0 ; crc add:xor
ld hl,primes
.crc
ld a,e
xor (hl)
ld e,a
xor d
add a,(hl)
ld d,a
inc hl
dec bc
ld a,b
or c
jr nz,.crc
ld a,4
ld hl,0x0DEB
or a
sbc hl,de
jr z,.ok0
ld a,2
.ok0:
pop hl
ld de,0x0B2C
or a
sbc hl,de
jr z,.ok1
ld a,2
.ok1:
ld hl,25999
ld de,(ix-2) ; fake
or a
sbc hl,de
jr z,.ok2
ld a,2
.ok2:
out (254),a
jr $
caseTime:
ei
halt
ld a,5
out (254),a
call sieve_a_2
xor a
out (254),a
jr caseTime
;----------------------------------------------------------------------------------------------------------------------
; 138B hand written asm sieve optimised for performance, taking many shortcuts, using odd-number-only sieve
; "work" can overlap "primes" starting from primes+8 address (to maximise amount of primes calculated and returned)
sieve_a_2:
ASSERT high work && 2 <= work_size
ASSERT (primes + 8 <= work) || (work + work_size <= primes)
ASSERT work + work_size <= 0xFFFF ; if work buffer would have last byte at 0xFFFF, the calculation of "i" breaks
; logical : memory address offset into work buffer, storing only odd numbers sieve
; work[3] : -1 (not stored in memory at all)
; work[5] : +0
; work[7] : +1
; work[9] : +2
; *p++ = 2;
ld ix,primes
xor a
ld (ix+0),2
ld (ix+1),a
inc ix
inc ix
; clear the work buffer (full clear, first byte of buffer is marker for prime 5)
; for (uint16_t i = 0; i < work_size; i++) work[i] = 0x00;
ld hl,work
ld de,work+1
ld bc,work_size-1
ld (hl),a
ldir
ld hl,work
ld bc,work_size
; mark prime 3 without searching by starting .mark_and_loop at work[5]
.mark_and_loop:
; work[i] signals prime ; IX = p, HL = work[i+2] = work + (i-5)/2 + 1, BC = bytes to check (>0)
ex de,hl
; i = 2*(hl - work + 1) + 1
ld hl,1-work
add hl,de ; always cf=1 when 0xFFFF is not part of work buffer
adc hl,hl
; *p++ = i;
ld (ix+0),l
ld (ix+1),h
inc ix
inc ix
; IX = p, HL = i, DE = work[i+2], BC = bytes to check (BTW DE+BC = work.end(), but unable to use it)
; mark work buffer at all multiplies of `i` (inner loop setting the sieve)
; for (uint16_t j = i + i + i; j < work_size; j += i+i) work[j] = 1;
push bc ; preserve bytes to check
push de ; preserve work[i+2]
ld bc,-(work+work_size)
add hl,bc ; HL = i-(work+work_size) ; constant to check buffer overrun and add i+i to "j"
jr c,.inner_setup_ov ; when this happens, it happens for all following primes -> faster final loop
ex de,hl
dec hl ; HL = work[i], DE = i-(work+work_size), BC = -(work+work_size)
add hl,de ; j = i + i + i (adjusting work[j] address), plus test for work_size, must not overflow
jr c,.inner_setup_ov ; when this happens, it happens for all following primes -> faster final loop
.inner_loop:
sbc hl,bc ; restore hl to hl+j
ld (hl),h ; work[j] = non_zero_value
add hl,de ; hl += i + test for work_size, must not overflow
jp nc,.inner_loop
pop hl ; restore work[i+2] pointer = next to check
pop bc ; restore bytes to check
; outer_loop ; for (uint16_t i = 3; i < work_size; i+=2)
cpir
jp pe,.mark_and_loop ; zf=1, pv=1
; pv=0 case is UNREACHABLE ; should never happen, because sooner the inner loop setup will overflow
; outer_loop phase 2, when marking primes is always impossible, (j >= work_size) == true because of large i
.inner_setup_ov:
pop hl ; restore work[i+2] pointer = next to check
pop bc ; restore bytes to check
jp .final_loop_entry
.final_loop:
; work[i] signals prime ; IX = p, HL = work[i+2] = work + (i-5)/2 + 1, BC = bytes to check (>0)
ex de,hl
; i = 2*(hl - work + 1) + 1
ld hl,1-work
add hl,de ; always cf=1 when 0xFFFF is not part of work buffer
adc hl,hl
; *p++ = i;
ld (ix+0),l
ld (ix+1),h
inc ix
inc ix
ex de,hl ; restore HL to work[i+2] and continue with final loop
.final_loop_entry:
cpir
jp pe,.final_loop ; zf=1, pv=1
jr nz,.final_loop_done ; zf=0, pv=0 (last work[i] was not prime)
; zf=1, pv=0 (match at last byte) - store last prime
ld de,1-work
add hl,de ; always cf=1 when 0xFFFF is not part of work buffer
adc hl,hl
; *p++ = i;
ld (ix+0),l
ld (ix+1),h
inc ix
inc ix
.final_loop_done:
; return p-primes;
ld a,ixl
sub low(primes)
ld l,a
ld a,ixh
sbc a,high(primes)
rra
ld h,a
rr l ; HL = number of primes
ret
DISPLAY "sieve_a_2 code size: ",/A,$-sieve_a_2
;----------------------------------------------------------------------------------------------------------------------
; "primes" and "work" buffers
primes: ds 2*primes_size,$AA
work: ds work_size,$BB
db 'X' ; the work buffer can't use 0xFFFF address, so this is guardian byte making assembling fail
SAVESNA "sieve2.sna", test_start ;; DEBUG snapshot
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment