Skip to content

Instantly share code, notes, and snippets.

@lojic
Last active January 19, 2016 06:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lojic/413f972bcaf1a6b156e2 to your computer and use it in GitHub Desktop.
Save lojic/413f972bcaf1a6b156e2 to your computer and use it in GitHub Desktop.
Sequential version of file parser (3 files)
#lang racket
(define space (char->integer #\space))
(define zero (char->integer #\0))
(define nine (char->integer #\9))
(define max-ascii 127)
;; Indicate wither the byte is in [ 0 .. 9 ]
(define (byte-numeric? b)
(and (>= b zero)
(<= b nine)))
;; Return the first byte of a byte string
(define (bytes-car bytes)
(bytes-ref bytes 0))
;; Return all but the first byte of a byte string
(define (bytes-cdr bytes)
(subbytes bytes 1))
;; Return a byte string with each occurrence removed
;; val: byte?
(define (bytes-delete bytes val)
(define len (bytes-length bytes))
(define (has-val? i)
(cond [(= i len) #f]
[(= val (bytes-ref bytes i)) #t]
[else (has-val? (+ i 1))]))
(if (> len 0)
(cond [(has-val? 0)
(define result (make-bytes len))
(define dst 0)
(for ([b (in-bytes bytes)])
(cond [(= b val) '()]
[else (bytes-set! result dst b)
(set! dst (+ dst 1))]))
(subbytes result 0 dst)]
[else bytes])
bytes))
;; Return a byte string with each byte for which pred returns true removed
;; pred: procedure?
(define (bytes-delete-pred bytes pred)
(define len (bytes-length bytes))
(define (has-val? i)
(cond [(= i len) #f]
[(pred (bytes-ref bytes i)) #t]
[else (has-val? (+ i 1))]))
(if (> len 0)
(cond [(has-val? 0)
(define result (make-bytes len))
(define dst 0)
(for ([b (in-bytes bytes)])
(cond [(pred b) '()]
[else (bytes-set! result dst b)
(set! dst (+ dst 1))]))
(subbytes result 0 dst)]
[else bytes])
bytes))
;; Return a byte string with each occurrence of "from" replaced by "to"
;; from: byte?
;; to: byte?
(define (bytes-replace bytes from to)
(define len (bytes-length bytes))
(define (has-from? i)
(cond [(= i len) #f]
[(= from (bytes-ref bytes i)) #t]
[else (has-from? (+ i 1))]))
(cond [(has-from? 0)
(define result (make-bytes len))
(for ([i (in-range len)]
[b (in-bytes bytes)])
(if (= b from)
(bytes-set! result i to)
(bytes-set! result i b)))
result]
[else bytes]))
;; Split a byte string into a list of byte strings based on the specified sep byte.
;; NOTE: Only handles case of a single separator byte
(define (bytes-split bytes sep)
(define len (bytes-length bytes))
(define (helper beg end result)
(cond
; Begin is beyond end of byte string
[(= beg len)
(if (= sep (bytes-ref bytes (- len 1)))
(reverse (cons #"" result))
(reverse result))]
; End is beyond end of byte string
[(= end len) (reverse (cons (subbytes bytes beg end) result))]
; End is at sep
[(= sep (bytes-ref bytes end))
(let ([i (+ end 1)])
(helper i i (cons (subbytes bytes beg end) result)))]
; Increment end
[else (helper beg (+ end 1) result)]))
(if (> len 0)
(helper 0 0 '())
(list bytes)))
;; Returns a new string where runs of the same character are replaced
;; by a single character.
;; (string-squeeze "abbaacdccee") -> "abacdce"
(define (bytes-squeeze bytes)
(define result (make-bytes (bytes-length bytes)))
(define last null)
(define i -1)
(for ([c (in-bytes bytes)])
(when (not (eqv? c last))
(set! i (+ i 1))
(set! last c)
(bytes-set! result i c)))
(subbytes result 0 (+ i 1)))
;; Trims leading/trailing bytes that are <= <space> or > <max ascii>
(define (bytes-trim bytes)
(define len (bytes-length bytes))
(let left ([i 0])
(if (< i len)
(let ([left-byte (bytes-ref bytes i)])
(if (or (<= left-byte space)
(> left-byte max-ascii))
(left (+ i 1))
(let right ([j (- len 1)])
(if (>= j i)
(let ([right-byte (bytes-ref bytes j)])
(if (or (<= right-byte space)
(> right-byte max-ascii))
(right (- j 1))
(subbytes bytes i (+ j 1))))
#""))))
#"")))
;; Indicate whether the argument is a byte string and is not empty
(define (non-empty-bytes? bytes)
(and (bytes? bytes)
(> (bytes-length bytes) 0)))
(provide byte-numeric? bytes-car bytes-cdr bytes-delete bytes-delete-pred bytes-replace bytes-split bytes-squeeze bytes-trim max-ascii non-empty-bytes? space)
#lang racket
(require "./byte-string-support.rkt")
(require "./soundex.rkt")
;(require profile)
(define backslash (char->integer #\\))
(define comma (char->integer #\,))
(define zero (char->integer #\0))
(define (parse-file ipath epath apath)
(call-with-input-file ipath
(λ (i-in)
(call-with-output-file epath
(λ (e-out)
(call-with-output-file apath
(λ (a-out)
(parse-records i-in e-out a-out))
#:exists 'replace))
#:exists 'replace))))
(define (parse-records i-in e-out a-out)
(define num-deletes 0)
(define num-cases 0)
(define num-aliases 0)
;; Extract header
(fprintf e-out
"~a\n"
(string-append
"copy extracts (case_key,county_name,last_name,last_name_soundex,first_name,"
"first_name_soundex,middle_name,middle_name_soundex,suffix,addr_street1,addr_street2,"
"addr_city,addr_state,addr_zip5,addr_zip4,race,sex,birth_year,birth_month,birth_day,"
"last_4_ssn,driver_license,driver_license_state,citation_number,added_to_acis_date,"
"extract_date,personal_identifier,case_disposed,court_type) from stdin;"))
;; SubjectAlias header
(fprintf a-out
"~a\n"
(string-append
"copy subject_aliases (case_key,line_number,last_name,last_name_soundex,first_name,"
"first_name_soundex,middle_name,suffix) from stdin;"))
;; Main loop
(for ([line-no (in-naturals 1)]
[line (in-bytes-lines i-in)])
(define record-type (subbytes line 0 2))
(match record-type
[#"00" null ]
[#"01" (parse-case e-out line-no line)
(set! num-cases (+ num-cases 1)) ]
[#"02" (parse-alias a-out line-no line)
(set! num-aliases (+ num-aliases 1)) ]
[#"99" null ]
[_ (log-error parse-records line-no (format "invalid record type: ~a" record-type)) ]))
(fprintf e-out "\\.\n")
(fprintf a-out "\\.\n"))
(define (parse-alias a-out line-no line)
;; Only use 27 because the '@' character is in the last position!
(define-values (last-name first-name middle-name suffix) (parse-name (subbytes line 20 47)))
(define last-name-soundex (if last-name (soundex last-name) #f))
(define first-name-soundex (if first-name (soundex first-name) #f))
(fprintf a-out "~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\n"
(opt-field (parse-case-key line))
(opt-field (parse-alias-line (subbytes line 18 20)))
(opt-field last-name)
(opt-field last-name-soundex)
(opt-field first-name)
(opt-field first-name-soundex)
(opt-field middle-name)
(opt-field suffix)))
(define (parse-case e-out line-no line)
(define-values (last-name first-name middle-name suffix) (parse-name (subbytes line 30 58)))
(define last-name-soundex (if last-name (soundex last-name) #f))
(define first-name-soundex (if first-name (soundex first-name) #f))
(fprintf e-out "~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\n"
(opt-field (parse-case-key line))
(opt-field (parse-bytes line 18 30)) ; county name
(opt-field last-name)
(opt-field last-name-soundex)
(opt-field first-name)
(opt-field first-name-soundex)
(opt-field middle-name)
(opt-field suffix)
(opt-field (parse-bytes line 58 78)) ; addr_street1
(opt-field (parse-bytes line 78 93)) ; addr_street2
(opt-field (parse-bytes line 93 108)) ; city
(opt-field (parse-bytes line 108 110)) ; state
(opt-field (parse-bytes line 110 115)) ; zip5
(opt-field (parse-bytes line 115 119)) ; zip4
(opt-field (parse-bytes line 119 120)) ; race
(opt-field (parse-bytes line 120 121)) ; sex
(opt-field (parse-bytes line 121 125)) ; birth_year
(opt-field (parse-bytes line 125 127)) ; birth_month
(opt-field (parse-bytes line 127 129)) ; birth_day
(opt-field (parse-bytes line 129 133)) ; last_4_ssn
(opt-field (parse-bytes line 133 153)) ; driver_license
(opt-field (parse-bytes line 153 155)) ; driver_license_state
(opt-field (parse-bytes line 155 163)) ; citation_number
(opt-field (parse-bytes line 163 171)) ; added_to_acis_date
(opt-field (parse-bytes line 171 179)) ; extract_date
(opt-field (parse-bytes line 179 189)) ; personal_identifier
(opt-field (parse-bytes line 189 190)) ; case_disposed
(opt-field (parse-bytes line 190 193)))) ; court_type
(define (parse-alias-line bytes)
(if (= zero (bytes-ref bytes 0))
(subbytes bytes 1 2)
bytes))
(define (parse-case-key line)
(bytes-trim (subbytes line 2 18)))
(define (parse-name bytes)
(define tokens (bytes-split (bytes-delete bytes backslash) comma))
(define len (length tokens))
(values
(and (> len 0) (bytes-trim (car tokens)))
(and (> len 1) (bytes-trim (cadr tokens)))
(and (> len 2) (bytes-trim (caddr tokens)))
(and (> len 3) (bytes-trim (cadddr tokens)))))
(define (opt-field bytes)
(if (and bytes (> (bytes-length bytes) 0))
bytes
#"\\N"))
(define (parse-bytes line beg end)
(bytes-trim (bytes-delete (subbytes line beg end) backslash)))
(define (log-error proc-sym line-no str)
(printf "*error* Parser:~a line ~a: ~a" (object-name proc-sym) line-no str))
(define (run)
(parse-file "cr-4000"
"e1-file"
"a1-file"))
;(profile-thunk run #:use-errortrace? #t)
(time (run))
#lang racket
(require "./byte-string-support.rkt")
;; --------------------------------------------------------------------------------------------
;; Public Interface
;; --------------------------------------------------------------------------------------------
;; Convert a byte string to a soundex code. If the input contains no
;; alphabetic bytes, return #f
(define (soundex original)
(define clean (bytes-delete-pred original (λ (b) (or (< b a-code)
(> b z-code)))))
(if (non-empty-bytes? clean)
(soundex-inner clean)
#f))
;; --------------------------------------------------------------------------------------------
;; Private
;; --------------------------------------------------------------------------------------------
(define a-code (char->integer #\A))
(define e-code (char->integer #\E))
(define h-code (char->integer #\H))
(define i-code (char->integer #\I))
(define o-code (char->integer #\O))
(define u-code (char->integer #\U))
(define w-code (char->integer #\W))
(define y-code (char->integer #\Y))
(define z-code (char->integer #\Z))
(define num-letters (+ (- z-code a-code) 1))
(define soundex-chars #"ABCDEFGHIJKLMNOPQRSTUVWXYZ")
(define soundex-codes #"A123E12HI22455O12623U1W2Y2")
(define soundex-code-length 4)
(define (soundex-inner original)
;; 1. Save the first letter.
(define first_letter (bytes-car original))
;; 2. Remove all occurrences of 'H' and 'W' EXCEPT first letter.
(define str (bytes-append
(subbytes original 0 1)
(bytes-delete-pred
(if (> (bytes-length original) 1)
(bytes-cdr original)
#"")
(λ (b) (or (= b h-code) (= b w-code))))))
;; 3. Replace all consonants (including the first letter) with digits as follows:
;; b, f, p, v -> 1
;; c, g, j, k, q, s, x, z -> 2
;; d, t -> 3
;; l -> 4
;; m, n -> 5
;; r -> 6
(set! str (bytes-tr str tr-map))
;; 4. Replace all adjacent same digits with one digit.
(set! str (bytes-squeeze str))
;; 5. Remove all occurrences of A, E, I, O, U, Y EXCEPT the first letter.
(set! str (let ([len (bytes-length str)])
(bytes-append
(subbytes str 0 1)
(bytes-delete-pred
(if (> len 1)
(subbytes str 1 len)
#"")
(λ (b) (or (= b a-code)
(= b e-code)
(= b i-code)
(= b o-code)
(= b u-code)
(= b y-code)))))))
;; 6. If first symbol is a digit replace it with letter saved on step 1.
(when (byte-numeric? (bytes-ref str 0))
(bytes-set! str 0 first_letter))
;; 7. Append zeros or truncate so that the result is a letter followed by 3 digits.
(let ([len (bytes-length str)])
(cond [(< len soundex-code-length) (subbytes (bytes-append str #"000") 0 soundex-code-length)]
[(> len soundex-code-length) (subbytes str 0 soundex-code-length)]
[else str])))
;; Create a translation map to convert letters to soundex digits.
(define tr-map
(let ([base a-code]
[v (make-vector num-letters)])
(for ([c1 (in-bytes soundex-chars)]
[c2 (in-bytes soundex-codes)])
(vector-set! v (- c1 base) c2))
v))
;; Translate bytes in a byte string based on a translation map
(define (bytes-tr bytes map)
(define result (make-bytes (bytes-length bytes)))
(for ([c (in-bytes bytes)]
[i (in-naturals)])
(let ([k (- c a-code)])
(if (<= 0 k (- num-letters 1))
(bytes-set! result i (vector-ref map k))
(bytes-set! result i c))))
result)
(provide soundex)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment