Last active
May 9, 2022 08:16
-
-
Save bvibber/95e8b78c51f4071517f978b63aba06be to your computer and use it in GitHub Desktop.
Quick sketch of a minimal sweep function for a 6502 retro computing GC implementation with fixed-size LISP-style nodes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
; Note fixed addresses are illustrative, not real | |
.org $4000 | |
; Zero page variables | |
scratch1 = $80 | |
scratch2 = $82 | |
freelist = $84 ; ref to next free node | |
gc_color = $86 ; current representation of "white" for tricolor marking | |
heap_start = $8000 | |
heap_end = $c000 | |
gc_tag_mask = $c0 ; top 2 bits are for GC coloring + free marker | |
gc_tag_free = $00 ; this node is free. validity of freelist node marker is in a lower tag bit | |
gc_tag_check = $40 ; has been seen from a root or modified during incremental GC | |
gc_tag_zero = $80 ; represents either 'white' (reapable) or 'black' (live) sets | |
gc_tag_one = $c0 ; the opposite state | |
.struct Node | |
tag .byte | |
left .word | |
right .word | |
.endstruct | |
; Do a fast, minimal sweep after marking is complete. | |
; This cannot be interrupted, so must be as fast as | |
; possible. | |
; | |
; Current estimate is: | |
; - 12 cycles per node (+8 if freed) | |
; - 14 fixed cycles per unroll of 5 nodes (10x per page) | |
; - 2 + 41 = 43 fixed cycles per page | |
; - 61 + 6 = 67 fixed cycles | |
; - 64 * (12 * 51 + 14 * 10 + 41) + 67 | |
; - 64 * (612 + 140 + 41) + 67 | |
; - 64 * 793 + 40 | |
; - 50,792 total cycles | |
; | |
; for a 16 KiB heap (3,264 nodes over 64 pages) | |
; That's about 28.3 ms on an Atari at 1.79 MHz. | |
; | |
; Invariants: | |
; * there are no remaining nodes in 'check' set | |
; * all live, reachable nodes are in the 'black' set | |
; * all unreachable nodes are in the 'white' set | |
; * gc_color is set to current value for the 'white' set | |
; | |
sweep: | |
; use scratch1 as index | |
lda #heap_start.lobyte ; 2 cycles | |
sta scratch1 ; 3 cycles | |
lda #heap_start.hibyte ; 2 cycles | |
sta scratch1 + 1 ; 3 cycles | |
sta sweep_unroll1 + 2 ; 4 cycles | |
sta sweep_unroll2 + 2 ; 4 cycles | |
sta sweep_unroll3 + 2 ; 4 cycles | |
sta sweep_unroll4 + 2 ; 4 cycles | |
sta sweep_unroll5 + 2 ; 4 cycles | |
sta sweep_unroll255 + 2 ; 4 cycles | |
; save some cycles per node with self-modifying code | |
; (sickos.jpg) | |
lda gc_color ; 3 cycles | |
sta sweep_load_opcode1 + 2 ; 4 cycles | |
sta sweep_load_opcode2 + 2 ; 4 cycles | |
sta sweep_load_opcode3 + 2 ; 4 cycles | |
sta sweep_load_opcode4 + 2 ; 4 cycles | |
sta sweep_load_opcode5 + 2 ; 4 cycles | |
sta sweep_load_opcode255 + 2 ; 4 cycles | |
sweep_page: | |
; Each 256-byte pages contains 51 nodes (5 bytes each) | |
; One byte per page is wasted, not currently used | |
; But this makes it much easier to iterate :D | |
; | |
; y = 0 | |
ldy #00 ; 2 cycles | |
; check if the GC tag matches the reaping color | |
sweep_node: | |
; Unroll the loop 5x to reduce total cycles | |
sweep_unroll1: | |
ldy #00 ; 2 cycles | |
sweep_load_opcode1: | |
lda heap_start,y ; 4 cycles - self-modified | |
and #gc_tag_mask ; 2 cycles | |
sweep_cmp_opcode1: | |
cmp #ff ; 2 cycles - self-modified | |
bne sweep_unroll2 ; 2 cycles | |
; mark it as free with null free type. it'll be added to the freelist later | |
lda #0 ; 2 cycles | |
sta (scratch1),y ; 6 cycles | |
sweep_unroll2: | |
ldy #05 ; 2 cycles | |
sweep_load_opcode2: | |
lda heap_start,y ; 4 cycles - self-modified | |
and #gc_tag_mask ; 2 cycles | |
sweep_cmp_opcode2: | |
cmp #ff ; 2 cycles - self-modified | |
bne sweep_unroll3 ; 2 cycles | |
lda #0 ; 2 cycles | |
sta (scratch1),y ; 6 cycles | |
sweep_unroll3: | |
ldy #10 ; 2 cycles | |
sweep_load_opcode3: | |
lda heap_start,y ; 4 cycles - self-modified | |
and #gc_tag_mask ; 2 cycles | |
sweep_cmp_opcode3: | |
cmp #ff ; 2 cycles - self-modified | |
bne sweep_unroll4 ; 2 cycles | |
lda #0 ; 2 cycles | |
sta (scratch1),y ; 6 cycles | |
sweep_unroll4: | |
ldy #15 ; 2 cycles | |
sweep_load_opcode4: | |
lda heap_start,y ; 4 cycles - self-modified | |
and #gc_tag_mask ; 2 cycles | |
sweep_cmp_opcode4: | |
cmp #ff ; 2 cycles - self-modified | |
bne sweep_unroll5 ; 2 cycles | |
lda #0 ; 2 cycles | |
sta (scratch1),y ; 6 cycles | |
sweep_unroll5: | |
ldy #20 ; 2 cycles | |
sweep_load_opcode5: | |
lda heap_start,y ; 4 cycles - self-modified | |
and #gc_tag_mask ; 2 cycles | |
sweep_cmp_opcode5: | |
cmp #ff ; 2 cycles - self-modified | |
bne sweep_next5 ; 2 cycles | |
lda #0 ; 2 cycles | |
sta (scratch1),y ; 6 cycles | |
sweep_next: | |
; scratch1 += 25 | |
lda scratch1 ; 3 cycles | |
clc ; 2 cycles | |
adc #25 ; 2 cycles | |
sta scratch1 ; 3 cycles | |
; loop unroll end condition at y == 250 | |
cmp #250 ; 2 cycles | |
bne sweep_node ; 2 cycles | |
; one last node :D | |
sweep_unroll255: | |
ldy #0 ; 2 cycles | |
sweep_load_opcode255: | |
lda heap_start,y ; 4 cycles - self-modified | |
and #gc_tag_mask ; 2 cycles | |
sweep_cmp_opcode255: | |
cmp #ff ; 2 cycles - self-modified | |
bne sweep_next_page ; 2 cycles | |
lda #0 ; 2 cycles | |
sta (scratch1),y ; 6 cycles | |
sweep_next_page: | |
; scratch1 += 256 | |
lda #00 ; 2 cycles | |
sta scratch1 ; 3 cycles | |
inc scratch1 + 1 ; 5 cycles | |
lda scratch1 + 1 ; 3 cycles | |
sta sweep_load_opcode1 + 2 ; 4 cycles | |
sta sweep_load_opcode2 + 2 ; 4 cycles | |
sta sweep_load_opcode3 + 2 ; 4 cycles | |
sta sweep_load_opcode4 + 2 ; 4 cycles | |
sta sweep_load_opcode5 + 2 ; 4 cycles | |
sta sweep_load_opcode255 + 2 ; 4 cycles | |
; end condition; scratch1 != heap_end | |
cmp #heap_end.hibyte ; 2 cycles | |
bne sweep_page ; 2 cycles | |
rts ; 6 cycles | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment