Skip to content

Instantly share code, notes, and snippets.

@shhyou
Created March 27, 2023 03:50
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save shhyou/484b4202bcfcec70b85e7227522bb111 to your computer and use it in GitHub Desktop.
Save shhyou/484b4202bcfcec70b85e7227522bb111 to your computer and use it in GitHub Desktop.
An O(n lg^2 n) implementation of the Burrows-Wheeler transform using the doubling algorithm and an O(n|S|) implementation of the inverse transform using modified radix sort.
#lang racket/base
(require racket/list
racket/vector
racket/bytes
racket/match)
(provide
;; plain implementation of the Burrows-Wheeler transformation
(struct-out bwencode)
plain-bwt-encode
plain-bwt-decode
;; faster implementation
bwt-encode
bwt-decode
)
(module+ test
(require rackunit))
(struct bwencode (sorted-index bytes) #:transparent)
(module+ test
#|
banana 0..5
ananab 1..5 0..0
nanaba 2..5 0..1
anaban 2..5 0..2
nabana 2..5 0..3
abanan 5..5 0..4
=sort=>
0 abana|n
1 anaba|n
2 anana|b
* 3 banan|a
4 naban|a
5 nanab|a
|#
(check-equal?
(plain-bwt-encode #"banana")
(bwencode 3 #"nnbaaa"))
;; https://en.wikipedia.org/wiki/Burrows-Wheeler_transform
(check-equal?
(plain-bwt-encode #"^BANANA|")
(bwencode 6 #"BNN^AA|A"))
(check-equal?
(plain-bwt-encode #"R")
(bwencode 0 #"R"))
(check-equal?
(plain-bwt-encode #"NU")
(bwencode 0 #"UN"))
(check-equal?
(plain-bwt-encode #"UN")
(bwencode 1 #"UN"))
(define concolic-abs
(bytes-append
#"Higher-order functions have become a staple of modern programming languages. "
#"However, such values stymie concolic testers, as the SMT solvers at their hearts "
#"are inherently first-order. This paper lays a formal foundations for concolic "
#"testing higher-order functional programs. Three ideas enable our results: "
#"(i) our tester considers only program inputs in a canonical form; (ii) it "
#"collects novel constraints from the evaluation of the canonical inputs to "
#"search the space of inputs with partial help from an SMT solver and "
#"(iii) it collects constraints from canonical inputs even when they are "
#"arguments to concretized calls. We prove that (i) concolic evaluation is sound "
#"with respect to concrete evaluation; (ii) modulo concretization and SMT solver "
#"incompleteness, the search for a counter-example succeeds if a user program has "
#"a bug and (iii) this search amounts to directed evolution of inputs targeting "
#"hard-to-reach corners of the program."))
(define concolic-rot
(bytes-append
#"t:;;dd.nde...snrsefhmrgnyse,seadamett)erooorslhaosceesdyhslalpssrrgmsrlgess"
#"relmflfn))grf)sneensse)shryenlehrseoTTTsease,srccesfmf,htn)tsssahnsd i"
#"iiiiisrsrrrtodsrssmsmn. SSS MMM npeurrcmiccncvvvvrrrr xr lcccp"
#"teeeh t pehe h zuuudhl a iiiiii uacrarrunnn en nnneenneennne"
#"enrnirrroi eohmrvhihtercllhWhhlvrssshdbprllzterchvhhv trmvtddvpsvthhdhdvndt"
#"ugrntttrrlgrr w -hiooooo unnnariiooooonrcccttcc t tttttt t wg"
#"gnt TtT(iii(itlllnnn sm hHi((i(( ttm aattttttttedfhh wwtteaaaaaa p"
#"bpllpoooooaueluaaaaooootnooaaoarrouyma aaoaaiooooreeoeooooiooouuaaauureiii"
#"aiooooaaa iiiiioooooueiieutlttttmm rrrrrcccccvsssrrrcciiiiiiccccccnnn i"
#"icccff---ffcsfcm rnHls saamm nnnnnoeoeeeieeeuueeeeettgggggaaaaoooaa-"
#"ihe cccaaooeopppppfffpeeeeiaaaytrtatnttndttrriiietattttrsemlt un eer"
#" eeenn eaiiacss ecesns ii resaauaacaceen -ssrcunnucunnuulsg"
#"lllsslbdsgffoooooo lpppppeee aooelleleo eelltaii"))
(check-equal?
(plain-bwt-encode concolic-abs)
(bwencode 172 concolic-rot))
(define letters-rep
(bytes-append* (make-list 1000 #"abcdefghijklmnopqrstuvwxyz")))
(define letters-sorted
(bytes-append*
(for/list ([idx (in-range 26)])
(make-bytes 1000 (+ 97 (modulo (+ idx 25) 26))))))
(check-equal?
(time (displayln '(plain-bwt-encode letters-rep))
(plain-bwt-encode letters-rep))
(bwencode 0 letters-sorted)))
(define (plain-bwt-encode bs)
(define len (bytes-length bs))
(define rotated-bss
(for/list ([i (in-range len)])
(cons i (bytes-append (subbytes bs i) (subbytes bs 0 i)))))
(define sorted-bss
(sort rotated-bss bytes<? #:key cdr))
(define bs-last-col (make-bytes len))
(for ([i (in-range len)]
[idx-bs (in-list sorted-bss)])
(bytes-set! bs-last-col i (bytes-ref (cdr idx-bs) (sub1 len))))
(bwencode (index-where sorted-bss (λ (idx-bs) (zero? (car idx-bs))))
(bytes->immutable-bytes bs-last-col)))
(module+ test
(check-equal?
(bwt-encode/substr-exp-sort #"banana")
(bwencode 3 #"nnbaaa"))
(check-equal?
(bwt-encode/substr-exp-sort #"^BANANA|")
(bwencode 6 #"BNN^AA|A"))
(check-equal?
(bwt-encode/substr-exp-sort #"R")
(bwencode 0 #"R"))
(check-equal?
(bwt-encode/substr-exp-sort #"NU")
(bwencode 0 #"UN"))
(check-equal?
(bwt-encode/substr-exp-sort #"UN")
(bwencode 1 #"UN"))
(check-equal?
(bwt-encode/substr-exp-sort concolic-abs)
(bwencode 172 concolic-rot))
(check-equal?
(time (displayln '(bwt-encode/substr-exp-sort letters-rep))
(bwt-encode/substr-exp-sort letters-rep))
(bwencode 0 letters-sorted)))
(struct rotslice (start len [rank #:mutable]) #:transparent)
(define (bwt-encode/substr-exp-sort bs)
(define len (bytes-length bs))
(define (build-from-half-subsubstrs sublen/2 half-substrs-tbl slice)
(values (vector-ref half-substrs-tbl (rotslice-start slice))
(vector-ref half-substrs-tbl (modulo (+ (rotslice-start slice) sublen/2) len))))
(define (cmp-1byte sl1 sl2)
(< (bytes-ref bs (rotslice-start sl1)) (bytes-ref bs (rotslice-start sl2))))
(define substrs1
(build-vector len (λ (i) (rotslice i 1 #f))))
(fill-slice-ranks! (sort (vector->list substrs1) cmp-1byte) cmp-1byte)
(define substrs-table
(cond
[(<= (bytes-length bs) 1)
(vector substrs1)]
[else
(list->vector
(cons substrs1
(let loop ([sublen/2 1] [substrs/2 substrs1])
(define new-substrs
(build-vector len (λ (i) (rotslice i (+ sublen/2 sublen/2) #f))))
(define (cmp-sublen sl1 sl2)
(define-values (subsl1-1 subsl1-2)
(build-from-half-subsubstrs sublen/2 substrs/2 sl1))
(define-values (subsl2-1 subsl2-2)
(build-from-half-subsubstrs sublen/2 substrs/2 sl2))
(if (not (= (rotslice-rank subsl1-1) (rotslice-rank subsl2-1)))
(< (rotslice-rank subsl1-1) (rotslice-rank subsl2-1))
(< (rotslice-rank subsl1-2) (rotslice-rank subsl2-2))))
(define largest-rank
(fill-slice-ranks! (sort (vector->list new-substrs) cmp-sublen) cmp-sublen))
(cons new-substrs
(cond [(= (+ 1 largest-rank) len) '()]
[(<= (* sublen/2 4) len) (loop (* sublen/2 2) new-substrs)]
[else '()])))))]))
(define max-sublen
(arithmetic-shift 1 (- (vector-length substrs-table) 1)))
(define (rots-cmp rot1 rot2)
(let cmploop ([pos 0] [i (sub1 (vector-length substrs-table))] [sublen max-sublen])
(cond
[(= pos len) #f]
[(<= (+ pos sublen) len)
(define substrs (vector-ref substrs-table i))
(define substr1 (vector-ref substrs (modulo (+ (rotslice-start rot1) pos) len)))
(define substr2 (vector-ref substrs (modulo (+ (rotslice-start rot2) pos) len)))
(if (not (= (rotslice-rank substr1) (rotslice-rank substr2)))
(< (rotslice-rank substr1) (rotslice-rank substr2))
(cmploop (+ pos sublen) (sub1 i) (arithmetic-shift sublen -1)))]
[else
(cmploop pos (sub1 i) (arithmetic-shift sublen -1))])))
(define sorted-rots
(sort (build-list len (λ (i) (rotslice i len #f)))
rots-cmp))
;; Compute the last column
(define bs-last-col (make-bytes len 0))
(for ([i (in-range len)]
[slice (in-list sorted-rots)])
(bytes-set! bs-last-col i (bytes-ref bs (modulo (+ len -1 (rotslice-start slice)) len))))
(bwencode (index-where sorted-rots (λ (slice) (zero? (rotslice-start slice))))
(bytes->immutable-bytes bs-last-col)))
(define (fill-slice-ranks! slices slice<)
(set-rotslice-rank! (car slices) 0)
(for/last ([curr-slice (cdr slices)]
[prev-slice (in-list slices)])
(set-rotslice-rank! curr-slice (if (slice< prev-slice curr-slice)
(+ 1 (rotslice-rank prev-slice))
(rotslice-rank prev-slice)))
(rotslice-rank curr-slice)))
(define bwt-encode bwt-encode/substr-exp-sort)
(module+ test
(check-equal? (plain-bwt-decode (bwencode 3 #"nnbaaa")) #"banana")
(check-equal? (plain-bwt-decode (bwencode 6 #"BNN^AA|A")) #"^BANANA|")
(check-equal? (plain-bwt-decode (bwencode 0 #"R")) #"R")
(check-equal? (plain-bwt-decode (bwencode 0 #"UN")) #"NU")
(check-equal? (plain-bwt-decode (bwencode 1 #"UN")) #"UN")
(let ([b #"the quick brown fox jumps over the lazy dog"])
(check-equal? (plain-bwt-decode (plain-bwt-encode b)) b)
(check-equal? (plain-bwt-decode (plain-bwt-encode b) #:builtin-sort? #t) b))
(check-equal? (plain-bwt-decode (bwencode 172 concolic-rot)) concolic-abs)
(check-equal? (time (displayln '(plain-bwt-decode (bwencode 0 letters-sorted)))
(plain-bwt-decode (bwencode 0 letters-sorted))) letters-rep))
(define (plain-bwt-decode a-bwt #:builtin-sort? [builtin-sort? #f])
(if builtin-sort?
(plain-bwt-decode/builtin-sort a-bwt)
(plain-bwt-decode/mat-radix-sort a-bwt)))
(define (plain-bwt-decode/builtin-sort a-bwt)
(define len (bytes-length (bwencode-bytes a-bwt)))
(define bs-mat
(for/fold ([bs-mat (build-list len (λ (ix) (make-bytes len)))])
([end-ix (in-range len 0 -1)])
(for ([bs (in-list bs-mat)]
[b (in-bytes (bwencode-bytes a-bwt))])
(bytes-set! bs (sub1 end-ix) b))
(sort bs-mat bytes<?)))
(list-ref bs-mat (bwencode-sorted-index a-bwt)))
(define (plain-bwt-decode/mat-radix-sort a-bwt)
(define len (bytes-length (bwencode-bytes a-bwt)))
(define bytes-idx/immutable
(vector->immutable-vector
(sorted-bytes-start-index (bwencode-bytes a-bwt))))
(define bs-mat
(for/fold ([bs-mat (build-vector len (λ (ix) (make-bytes len)))])
([end-ix (in-range len 0 -1)])
(for ([bs (in-vector bs-mat)]
[b (in-bytes (bwencode-bytes a-bwt))])
(bytes-set! bs (sub1 end-ix) b))
;; one iteration of radix sort
(define bytes-idx (vector-copy bytes-idx/immutable))
(define new-bs-mat (make-vector len #f))
(for ([bs (in-vector bs-mat)]
[b (in-bytes (bwencode-bytes a-bwt))])
(vector-set! new-bs-mat (vector-ref bytes-idx b) bs)
(vector-set! bytes-idx b (add1 (vector-ref bytes-idx b))))
new-bs-mat))
(vector-ref bs-mat (bwencode-sorted-index a-bwt)))
(module+ test
(check-equal? (bwt-decode (bwencode 3 #"nnbaaa")) #"banana")
(check-equal? (bwt-decode (bwencode 6 #"BNN^AA|A")) #"^BANANA|")
(check-equal? (bwt-decode (bwencode 0 #"R")) #"R")
(check-equal? (bwt-decode (bwencode 0 #"UN")) #"NU")
(check-equal? (bwt-decode (bwencode 1 #"UN")) #"UN")
(let ([b #"the quick brown fox jumps over the lazy dog"])
(check-equal? (bwt-decode (plain-bwt-encode b)) b)
(check-equal? (bwt-decode (bwt-encode b)) b))
(check-equal? (bwt-decode (bwencode 172 concolic-rot)) concolic-abs)
(check-equal? (time (displayln '(bwt-decode (bwencode 0 letters-sorted)))
(bwt-decode (bwencode 0 letters-sorted))) letters-rep))
(define (bwt-decode a-bwt)
(match-define (struct* bwencode ([bytes input-bytes]
[sorted-index sorted-ix]))
a-bwt)
(define bytes-idx (sorted-bytes-start-index input-bytes))
(define len (bytes-length input-bytes))
;; `permute-index` stores how sorting some bytes `bs` permutates the bytes such that
;; (forall i. bs'[permute-index[i]] = bs[i]) <=> bs' = sort(bs), or equivalently
;; forall i. sort(bs)[i] = bs[permute-index^{-1}[i]]
(define permute-index
(for/vector #:length len ([b (in-bytes input-bytes)])
(define ix (vector-ref bytes-idx b))
(vector-set! bytes-idx b (add1 (vector-ref bytes-idx b)))
ix))
(define output-bytes (make-bytes len))
(for/fold ([sorted-ix sorted-ix])
([i (in-range len)])
(bytes-set! output-bytes (- len i 1) (bytes-ref input-bytes sorted-ix))
(vector-ref permute-index sorted-ix))
output-bytes)
(define (sorted-bytes-start-index bs
#:output-bytes-count! [bytes-count (make-vector 256 0)]
#:output-bytes-index! [bytes-idx (make-vector 256 0)])
(for ([b (in-bytes bs)])
(vector-set! bytes-count b (add1 (vector-ref bytes-count b))))
(for/fold ([offset 0])
([i (in-range 0 256)])
(vector-set! bytes-idx i offset)
(+ offset (vector-ref bytes-count i)))
bytes-idx)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment