-
-
Save lojic/63dff68a3b84d6ab8e28d5d7cea807fc to your computer and use it in GitHub Desktop.
Fast I/O & Bytes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; Copyright (c) 2023 Brian Adkins | |
#lang racket | |
(require "./bytes-view.rkt") | |
(provide create-block-in | |
fill-buffer! | |
next-line!) | |
(define buffer-size (* 256 1024)) | |
(struct block-in (in ;; Input port | |
buffer ;; Buffer of bytes | |
line-beg ;; Index of beginning of current line inclusive | |
line-end ;; Index of end of current line exclusive | |
buffer-end) ;; Index of end of buffer exclusive | |
#:mutable | |
#:transparent) | |
(define (create-block-in [ in (current-input-port) ]) | |
(block-in in | |
(make-bytes buffer-size) | |
0 | |
0 | |
0)) | |
(define (fill-buffer! obj) | |
(let ([ line-beg (block-in-line-beg obj) ]) | |
(when (> line-beg 0) | |
(let ([ buffer (block-in-buffer obj) ] | |
[ buffer-end (block-in-buffer-end obj) ]) | |
;; Move existing bytes to beginning of buffer | |
(bytes-copy! buffer | |
0 | |
buffer | |
line-beg | |
buffer-end) | |
(set-block-in-line-beg! obj 0) | |
(set-block-in-line-end! obj (- (block-in-line-end obj) line-beg)) | |
(set-block-in-buffer-end! obj (- (block-in-buffer-end obj) line-beg))))) | |
;; Read more bytes into buffer | |
(let ([ num-read (read-bytes! (block-in-buffer obj) (block-in-in obj) (block-in-buffer-end obj) buffer-size) ]) | |
(when (not (eof-object? num-read)) | |
(set-block-in-buffer-end! obj (+ (block-in-buffer-end obj) | |
num-read))))) | |
(define (next-line! obj) | |
(let* ([ buffer (block-in-buffer obj) ] | |
[ line-beg (block-in-line-beg obj) ] | |
[ buffer-end (block-in-buffer-end obj) ] | |
[ idx (byte-index-of buffer | |
line-beg | |
(char->integer #\newline) | |
buffer-end) ]) | |
(cond [ idx | |
;; Newline found, return the line | |
(let* ([ line-end (add1 idx) ] | |
[ line (bytes-view buffer line-beg line-end) ]) | |
(set-block-in-line-beg! obj line-end) | |
(set-block-in-line-end! obj line-end) | |
line) ] | |
[ else | |
(fill-buffer! obj) | |
(let loop ([ buffer (block-in-buffer obj) ] | |
[ line-beg (block-in-line-beg obj) ] | |
[ line-end (block-in-line-end obj) ] | |
[ buffer-end (block-in-buffer-end obj) ]) | |
(if (< line-end buffer-end) | |
(let ([ c (integer->char (bytes-ref buffer line-end)) ]) | |
(set-block-in-line-end! obj (add1 line-end)) | |
(if (char=? c #\newline) | |
(let* ([ line-end (block-in-line-end obj) ] | |
[ line (bytes-view buffer line-beg line-end) ]) | |
(set-block-in-line-beg! obj line-end) | |
line) | |
(loop buffer | |
line-beg | |
(block-in-line-end obj) | |
buffer-end))) | |
(bytes-view buffer line-beg line-end))) ]))) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; Copyright (c) 2023 Brian Adkins | |
#lang racket | |
(require "./bytes-view.rkt") | |
(provide create-block-out | |
block-write-bytes! | |
flush-buffer!) | |
(define buffer-size (* 256 1024)) | |
(struct block-out (out | |
buffer | |
end) | |
#:mutable | |
#:transparent) | |
(define (create-block-out [ out (current-output-port) ]) | |
(block-out out | |
(make-bytes buffer-size) | |
0)) | |
(define (flush-buffer! obj) | |
(let ([ end (block-out-end obj) ]) | |
(when (> end 0) | |
(write-bytes (block-out-buffer obj) (block-out-out obj) 0 end) | |
(set-block-out-end! obj 0)))) | |
(define (block-write-bytes! obj view) | |
(let* ([ buffer (block-out-buffer obj) ] | |
[ view-beg (bytes-view-beg view) ] | |
[ view-end (bytes-view-end view) ] | |
[ view-len (- view-end view-beg) ]) | |
(when (> (+ (block-out-end obj) view-len) buffer-size) | |
(flush-buffer! obj)) | |
(let ([ end (block-out-end obj) ]) | |
(bytes-copy! buffer end (bytes-view-buffer view) view-beg view-end) | |
(set-block-out-end! obj (+ end view-len))))) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; Copyright (c) 2023 Brian Adkins | |
#lang racket | |
(require "./bytes-view.rkt") | |
(provide bytes-view-soundex) | |
(define empty-view (create-bytes-view #"")) | |
;; "-123-12_-22455-12623-1_2_2" ; original from wikipedia | |
;; ABCDEFGHIJKLMNOPQRSTUVWXYZ | |
(define codes #"-123-12_-22455-12623-1_2-2") ; treat y as a vowel to match Racket code for now | |
(define bA (char->integer #\A)) | |
(define bZ (char->integer #\Z)) | |
(define ba (char->integer #\a)) | |
(define bz (char->integer #\z)) | |
(define (get-code b) | |
(cond [ (and (>= b bA) (<= b bZ)) | |
(bytes-ref codes (- b bA)) ] | |
[ (and (>= b ba) (<= b bz)) | |
(bytes-ref codes (- b ba)) ] | |
[ else | |
0 ])) | |
(define (is-alpha b) | |
(or (and (>= b bA) (<= b bZ)) | |
(and (>= b ba) (<= b bz)))) | |
(define (bytes-view-soundex view) | |
(define len (bytes-view-length view)) | |
(define out (make-bytes 4)) | |
(bytes-copy! out 0 #"0000") | |
;; Set si to index of first alpha, if any | |
(define si (let loop ([ i 0 ]) | |
(if (or (>= i len) (is-alpha (bytes-view-ref view i))) | |
i | |
(loop (add1 i))))) | |
(cond [ (>= si len) | |
empty-view ] | |
[ else | |
(bytes-set! out 0 (bytes-view-ref view si)) | |
(let loop ([ si (add1 si) ][ i 1 ][ prev (get-code (bytes-ref out 0)) ]) | |
(if (or (>= si len) (>= i 4)) | |
(create-bytes-view out) | |
(let ([ code (get-code (bytes-view-ref view si)) ]) | |
(cond [ (or (= code 0) (= code prev)) | |
(loop (add1 si) i prev) ] | |
[ (= code (char->integer #\-)) | |
; vowel separator | |
(loop (add1 si) i 0) ] | |
[ (= code (char->integer #\_)) | |
; h, w, y separator do nothing | |
(loop (add1 si) i prev) ] | |
[ else | |
(bytes-set! out i code) | |
(loop (add1 si) (add1 i) code) ])))) ])) | |
(module+ test | |
(require rackunit) | |
;; get-code | |
(check-equal? (get-code (char->integer #\L)) (char->integer #\4)) | |
;; bytes-view-soundex | |
(for ([ pair (in-list '((#"GLENWYN" . #"G455") | |
(#"-CHARLES" . #"C642") | |
(#"Tymczak" . #"T522") | |
(#"Rupert" . #"R163") | |
(#"Robert" . #"R163") | |
(#"Ashcraft" . #"A261") | |
(#"Burroughs" . #"B620") | |
(#"Burrows" . #"B620") | |
(#"Ekzampul" . #"E251") | |
(#"Ellery" . #"E460") | |
(#"Euler" . #"E460") | |
(#"Example" . #"E251") | |
(#"Gauss" . #"G200") | |
(#"Ghosh" . #"G200") | |
(#"Gutierrez" . #"G362") | |
(#"Heilbronn" . #"H416") | |
(#"Hilbert" . #"H416") | |
(#"Jackson" . #"J250") | |
(#"Kant" . #"K530") | |
(#"Knuth" . #"K530") | |
(#"Ladd" . #"L300") | |
(#"Lee" . #"L000") | |
(#"Lissajous" . #"L222") | |
(#"Lloyd" . #"L300") | |
(#"Lukasiewicz" . #"L222") | |
(#"O'Hara" . #"O600") | |
(#"Pfister" . #"P236") | |
(#"Soundex" . #"S532") | |
(#"Sownteks" . #"S532") | |
(#"Tymczak" . #"T522") | |
(#"VanDeusen" . #"V532") | |
(#"Washington" . #"W252") | |
(#"Wheaton" . #"W350"))) ]) | |
(let* ([ bstr (car pair) ] | |
[ view (bytes-view-soundex (create-bytes-view bstr)) ]) | |
(check-equal? (bytes-view->bytes view) (cdr pair)))) | |
) | |
#;(define (compute-soundex view) | |
(~> (bytes-view->bytes view) | |
(bytes->string/latin-1 _) | |
(soundex-encode _) | |
(string->bytes/latin-1 _) | |
(create-bytes-view _))) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; Copyright (c) 2023 Brian Adkins | |
#lang racket | |
;; Byte String View | |
;; Enhances performance by reducing allocation and copying of bytes. | |
(provide byte-index-of | |
bytes-view->bytes | |
bytes-view-index-of | |
bytes-view-length | |
bytes-view-ref | |
bytes-view-starts-with | |
bytes-view-subview | |
create-bytes-view | |
empty-bytes-view? | |
split-bytes-view | |
trim-bytes-view! | |
(struct-out bytes-view)) | |
(struct bytes-view (buffer | |
beg | |
end) | |
#:mutable | |
#:transparent) | |
(define (byte-index-of bstr idx b end) | |
(let loop ([ idx idx ]) | |
(if (< idx end) | |
(if (= b (bytes-ref bstr idx)) | |
idx | |
(loop (add1 idx))) | |
#f))) | |
;; Return index of b (relative to beginning of view, or #f if not found | |
(define (bytes-view-index-of obj b) | |
(let ([ pos (byte-index-of (bytes-view-buffer obj) | |
(bytes-view-beg obj) | |
b | |
(bytes-view-end obj)) ]) | |
(if pos | |
(- pos (bytes-view-beg obj)) | |
#f))) | |
(define (bytes-view->bytes obj) | |
(subbytes (bytes-view-buffer obj) | |
(bytes-view-beg obj) | |
(bytes-view-end obj))) | |
(define (bytes-view-length obj) | |
(- (bytes-view-end obj) (bytes-view-beg obj))) | |
(define (bytes-view-ref obj pos) | |
(bytes-ref (bytes-view-buffer obj) (+ pos (bytes-view-beg obj)))) | |
(define (bytes-view-starts-with obj bstr) | |
(let ([ buf-len (bytes-view-length obj) ] | |
[ bstr-len (bytes-length bstr) ]) | |
(let loop ([ idx 0 ]) | |
(cond [ (>= idx bstr-len) #t ] ; Ran out of bstr | |
[ (>= idx buf-len) #f ] ; Ran out of view | |
[ (= (bytes-view-ref obj idx) (bytes-ref bstr idx)) | |
(loop (add1 idx)) ] | |
[ else #f ])))) | |
(define (bytes-view-subview obj beg end) | |
(let ([ view-beg (bytes-view-beg obj) ]) | |
(create-bytes-view (bytes-view-buffer obj) | |
(+ beg view-beg) | |
(+ end view-beg)))) | |
(define (create-bytes-view bstr [ beg #f ] [ end #f ]) | |
(let ([ beg (if beg beg 0) ] | |
[ end (if end end (bytes-length bstr)) ]) | |
(bytes-view bstr beg end))) | |
(define (empty-bytes-view? obj) | |
(= 0 (bytes-view-length obj))) | |
(define (split-bytes-view obj delim-char) | |
(let ([ delim (char->integer delim-char) ] | |
[ len (bytes-view-length obj) ]) | |
(if (< len 1) | |
'() | |
(let loop ([ first 0 ] | |
[ tok-end 0 ] | |
[ si 0 ] | |
[ result '() ]) | |
(if (< si len) | |
(if (= delim (bytes-view-ref obj si)) | |
(let ([ si (add1 si) ]) | |
(if (> tok-end 0) | |
(let ([ token (bytes-view-subview obj first tok-end) ]) | |
(trim-bytes-view! token) | |
(loop si si si (cons token result))) | |
(loop si si si (cons (create-bytes-view (bytes-view-buffer obj) 0 0) result)))) | |
(loop first (add1 tok-end) (add1 si) result)) | |
(if (> tok-end 0) | |
;; Last token | |
(let ([ token (bytes-view-subview obj first tok-end) ]) | |
(trim-bytes-view! token) | |
(reverse (cons token result))) | |
(if (= delim (bytes-view-ref obj (sub1 len))) | |
;; Handle trailing delim | |
(reverse (cons (create-bytes-view (bytes-view-buffer obj) 0 0) result)) | |
(reverse result)))))))) | |
(define (trim-bytes-view! obj) | |
(define buffer (bytes-view-buffer obj)) | |
(let loop ([ beg (bytes-view-beg obj) ] | |
[ end (bytes-view-end obj) ]) | |
(cond [ (<= end beg) | |
;; In case end < beg, use beg for both to indicate empty | |
(set-bytes-view-beg! obj beg) | |
(set-bytes-view-end! obj beg) ] | |
[ else | |
(let* ([ b1 (bytes-ref buffer beg) ] | |
[ b2 (bytes-ref buffer (sub1 end)) ] | |
[ beg2 (if (char-whitespace? (integer->char b1)) | |
(add1 beg) | |
beg) ] | |
[ end2 (if (char-whitespace? (integer->char b2)) | |
(sub1 end) | |
end) ]) | |
(cond [ (and (= beg beg2) (= end end2)) | |
;; No change, we're done | |
(set-bytes-view-beg! obj beg2) | |
(set-bytes-view-end! obj end2) ] | |
[ else | |
(loop beg2 end2) ])) ]))) | |
(module+ test | |
(require rackunit) | |
;; bytes-view | |
(let ([ obj (bytes-view #"Hello" 1 4) ]) | |
(check-false (empty-bytes-view? obj)) | |
(let ([ bstr (subbytes (bytes-view-buffer obj) (bytes-view-beg obj) (bytes-view-end obj)) ]) | |
(check-equal? bstr #"ell"))) | |
;; byte-index-of | |
(let* ([ bstr #"Hello, world" ] | |
[ idx (byte-index-of bstr 2 (char->integer #\,) (bytes-length bstr)) ]) | |
(check-equal? idx 5)) | |
;; bytes-view-index-of | |
;; 012345678901 | |
(let* ([ bstr #"Hello, world" ] | |
;; view is #"llo, wo" | |
[ view (create-bytes-view bstr 2 9) ]) | |
(check-equal? (bytes-view-index-of view (char->integer #\,)) 3) | |
(check-equal? (bytes-view-index-of view (char->integer #\w)) 5) | |
(check-false (bytes-view-index-of view (char->integer #\d)))) | |
;; bytes-view-starts-with | |
(let* ([ bstr #"abcdefg" ] | |
[ view (create-bytes-view bstr 1) ]) | |
(check-not-false (bytes-view-starts-with view #"b")) | |
(check-not-false (bytes-view-starts-with view #"bcd")) | |
(check-false (bytes-view-starts-with view #"bcdefgh")) | |
) | |
;; split-bytes-view | |
(let* ([ bstr #" Smith, John,G.\n" ] | |
[ view (create-bytes-view bstr) ] | |
[ lst (split-bytes-view view #\, ) ]) | |
(check-equal? (length lst) 3) | |
(check-equal? (bytes-view->bytes (first lst)) #"Smith") | |
(check-equal? (bytes-view->bytes (second lst)) #"John") | |
(check-equal? (bytes-view->bytes (third lst)) #"G.")) | |
(let* ([ bstr #" ,Smith, John,G.\n," ] | |
[ view (create-bytes-view bstr) ] | |
[ lst (split-bytes-view view #\, ) ]) | |
(check-equal? (length lst) 5) | |
(check-equal? (bytes-view->bytes (first lst)) #"") | |
(check-equal? (bytes-view->bytes (second lst)) #"Smith") | |
(check-equal? (bytes-view->bytes (third lst)) #"John") | |
(check-equal? (bytes-view->bytes (fourth lst)) #"G.") | |
(check-equal? (bytes-view->bytes (fifth lst)) #"")) | |
;; trim-bytes-view! | |
(for ([ pair (in-list '((#"" . #"") | |
(#" " . #"") | |
(#"A" . #"A") | |
(#"\tA" . #"A") | |
(#"\t A B\n" . #"A B") | |
)) ]) | |
(let* ([ view (create-bytes-view (car pair)) ] | |
[ bstr (cdr pair) ]) | |
(trim-bytes-view! view) | |
(check-equal? (bytes-view->bytes view) bstr))) | |
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; Copyright (c) 2023 Brian Adkins | |
#lang racket | |
(require "./block-input.rkt" | |
"./block-output.rkt" | |
"./bytes-view.rkt") | |
(module+ main | |
(define in (create-block-in)) | |
(define out (create-block-out)) | |
(let loop ([ line (next-line! in) ]) | |
(when (not (empty-bytes-view? line)) | |
(block-write-bytes! out line) | |
(loop (next-line! in)))) | |
(flush-buffer! out)) | |
#;(module+ main | |
(define in (current-input-port)) | |
(define out (current-output-port)) | |
(for ([ line (in-bytes-lines in) ]) | |
(write-bytes line out) | |
(newline out))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment