-
-
Save lojic/413f972bcaf1a6b156e2 to your computer and use it in GitHub Desktop.
Sequential version of file parser (3 files)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#lang racket | |
(define space (char->integer #\space)) | |
(define zero (char->integer #\0)) | |
(define nine (char->integer #\9)) | |
(define max-ascii 127) | |
;; Indicate wither the byte is in [ 0 .. 9 ] | |
(define (byte-numeric? b) | |
(and (>= b zero) | |
(<= b nine))) | |
;; Return the first byte of a byte string | |
(define (bytes-car bytes) | |
(bytes-ref bytes 0)) | |
;; Return all but the first byte of a byte string | |
(define (bytes-cdr bytes) | |
(subbytes bytes 1)) | |
;; Return a byte string with each occurrence removed | |
;; val: byte? | |
(define (bytes-delete bytes val) | |
(define len (bytes-length bytes)) | |
(define (has-val? i) | |
(cond [(= i len) #f] | |
[(= val (bytes-ref bytes i)) #t] | |
[else (has-val? (+ i 1))])) | |
(if (> len 0) | |
(cond [(has-val? 0) | |
(define result (make-bytes len)) | |
(define dst 0) | |
(for ([b (in-bytes bytes)]) | |
(cond [(= b val) '()] | |
[else (bytes-set! result dst b) | |
(set! dst (+ dst 1))])) | |
(subbytes result 0 dst)] | |
[else bytes]) | |
bytes)) | |
;; Return a byte string with each byte for which pred returns true removed | |
;; pred: procedure? | |
(define (bytes-delete-pred bytes pred) | |
(define len (bytes-length bytes)) | |
(define (has-val? i) | |
(cond [(= i len) #f] | |
[(pred (bytes-ref bytes i)) #t] | |
[else (has-val? (+ i 1))])) | |
(if (> len 0) | |
(cond [(has-val? 0) | |
(define result (make-bytes len)) | |
(define dst 0) | |
(for ([b (in-bytes bytes)]) | |
(cond [(pred b) '()] | |
[else (bytes-set! result dst b) | |
(set! dst (+ dst 1))])) | |
(subbytes result 0 dst)] | |
[else bytes]) | |
bytes)) | |
;; Return a byte string with each occurrence of "from" replaced by "to" | |
;; from: byte? | |
;; to: byte? | |
(define (bytes-replace bytes from to) | |
(define len (bytes-length bytes)) | |
(define (has-from? i) | |
(cond [(= i len) #f] | |
[(= from (bytes-ref bytes i)) #t] | |
[else (has-from? (+ i 1))])) | |
(cond [(has-from? 0) | |
(define result (make-bytes len)) | |
(for ([i (in-range len)] | |
[b (in-bytes bytes)]) | |
(if (= b from) | |
(bytes-set! result i to) | |
(bytes-set! result i b))) | |
result] | |
[else bytes])) | |
;; Split a byte string into a list of byte strings based on the specified sep byte. | |
;; NOTE: Only handles case of a single separator byte | |
(define (bytes-split bytes sep) | |
(define len (bytes-length bytes)) | |
(define (helper beg end result) | |
(cond | |
; Begin is beyond end of byte string | |
[(= beg len) | |
(if (= sep (bytes-ref bytes (- len 1))) | |
(reverse (cons #"" result)) | |
(reverse result))] | |
; End is beyond end of byte string | |
[(= end len) (reverse (cons (subbytes bytes beg end) result))] | |
; End is at sep | |
[(= sep (bytes-ref bytes end)) | |
(let ([i (+ end 1)]) | |
(helper i i (cons (subbytes bytes beg end) result)))] | |
; Increment end | |
[else (helper beg (+ end 1) result)])) | |
(if (> len 0) | |
(helper 0 0 '()) | |
(list bytes))) | |
;; Returns a new string where runs of the same character are replaced | |
;; by a single character. | |
;; (string-squeeze "abbaacdccee") -> "abacdce" | |
(define (bytes-squeeze bytes) | |
(define result (make-bytes (bytes-length bytes))) | |
(define last null) | |
(define i -1) | |
(for ([c (in-bytes bytes)]) | |
(when (not (eqv? c last)) | |
(set! i (+ i 1)) | |
(set! last c) | |
(bytes-set! result i c))) | |
(subbytes result 0 (+ i 1))) | |
;; Trims leading/trailing bytes that are <= <space> or > <max ascii> | |
(define (bytes-trim bytes) | |
(define len (bytes-length bytes)) | |
(let left ([i 0]) | |
(if (< i len) | |
(let ([left-byte (bytes-ref bytes i)]) | |
(if (or (<= left-byte space) | |
(> left-byte max-ascii)) | |
(left (+ i 1)) | |
(let right ([j (- len 1)]) | |
(if (>= j i) | |
(let ([right-byte (bytes-ref bytes j)]) | |
(if (or (<= right-byte space) | |
(> right-byte max-ascii)) | |
(right (- j 1)) | |
(subbytes bytes i (+ j 1)))) | |
#"")))) | |
#""))) | |
;; Indicate whether the argument is a byte string and is not empty | |
(define (non-empty-bytes? bytes) | |
(and (bytes? bytes) | |
(> (bytes-length bytes) 0))) | |
(provide byte-numeric? bytes-car bytes-cdr bytes-delete bytes-delete-pred bytes-replace bytes-split bytes-squeeze bytes-trim max-ascii non-empty-bytes? space) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#lang racket | |
(require "./byte-string-support.rkt") | |
(require "./soundex.rkt") | |
;(require profile) | |
(define backslash (char->integer #\\)) | |
(define comma (char->integer #\,)) | |
(define zero (char->integer #\0)) | |
(define (parse-file ipath epath apath) | |
(call-with-input-file ipath | |
(λ (i-in) | |
(call-with-output-file epath | |
(λ (e-out) | |
(call-with-output-file apath | |
(λ (a-out) | |
(parse-records i-in e-out a-out)) | |
#:exists 'replace)) | |
#:exists 'replace)))) | |
(define (parse-records i-in e-out a-out) | |
(define num-deletes 0) | |
(define num-cases 0) | |
(define num-aliases 0) | |
;; Extract header | |
(fprintf e-out | |
"~a\n" | |
(string-append | |
"copy extracts (case_key,county_name,last_name,last_name_soundex,first_name," | |
"first_name_soundex,middle_name,middle_name_soundex,suffix,addr_street1,addr_street2," | |
"addr_city,addr_state,addr_zip5,addr_zip4,race,sex,birth_year,birth_month,birth_day," | |
"last_4_ssn,driver_license,driver_license_state,citation_number,added_to_acis_date," | |
"extract_date,personal_identifier,case_disposed,court_type) from stdin;")) | |
;; SubjectAlias header | |
(fprintf a-out | |
"~a\n" | |
(string-append | |
"copy subject_aliases (case_key,line_number,last_name,last_name_soundex,first_name," | |
"first_name_soundex,middle_name,suffix) from stdin;")) | |
;; Main loop | |
(for ([line-no (in-naturals 1)] | |
[line (in-bytes-lines i-in)]) | |
(define record-type (subbytes line 0 2)) | |
(match record-type | |
[#"00" null ] | |
[#"01" (parse-case e-out line-no line) | |
(set! num-cases (+ num-cases 1)) ] | |
[#"02" (parse-alias a-out line-no line) | |
(set! num-aliases (+ num-aliases 1)) ] | |
[#"99" null ] | |
[_ (log-error parse-records line-no (format "invalid record type: ~a" record-type)) ])) | |
(fprintf e-out "\\.\n") | |
(fprintf a-out "\\.\n")) | |
(define (parse-alias a-out line-no line) | |
;; Only use 27 because the '@' character is in the last position! | |
(define-values (last-name first-name middle-name suffix) (parse-name (subbytes line 20 47))) | |
(define last-name-soundex (if last-name (soundex last-name) #f)) | |
(define first-name-soundex (if first-name (soundex first-name) #f)) | |
(fprintf a-out "~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\n" | |
(opt-field (parse-case-key line)) | |
(opt-field (parse-alias-line (subbytes line 18 20))) | |
(opt-field last-name) | |
(opt-field last-name-soundex) | |
(opt-field first-name) | |
(opt-field first-name-soundex) | |
(opt-field middle-name) | |
(opt-field suffix))) | |
(define (parse-case e-out line-no line) | |
(define-values (last-name first-name middle-name suffix) (parse-name (subbytes line 30 58))) | |
(define last-name-soundex (if last-name (soundex last-name) #f)) | |
(define first-name-soundex (if first-name (soundex first-name) #f)) | |
(fprintf e-out "~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\n" | |
(opt-field (parse-case-key line)) | |
(opt-field (parse-bytes line 18 30)) ; county name | |
(opt-field last-name) | |
(opt-field last-name-soundex) | |
(opt-field first-name) | |
(opt-field first-name-soundex) | |
(opt-field middle-name) | |
(opt-field suffix) | |
(opt-field (parse-bytes line 58 78)) ; addr_street1 | |
(opt-field (parse-bytes line 78 93)) ; addr_street2 | |
(opt-field (parse-bytes line 93 108)) ; city | |
(opt-field (parse-bytes line 108 110)) ; state | |
(opt-field (parse-bytes line 110 115)) ; zip5 | |
(opt-field (parse-bytes line 115 119)) ; zip4 | |
(opt-field (parse-bytes line 119 120)) ; race | |
(opt-field (parse-bytes line 120 121)) ; sex | |
(opt-field (parse-bytes line 121 125)) ; birth_year | |
(opt-field (parse-bytes line 125 127)) ; birth_month | |
(opt-field (parse-bytes line 127 129)) ; birth_day | |
(opt-field (parse-bytes line 129 133)) ; last_4_ssn | |
(opt-field (parse-bytes line 133 153)) ; driver_license | |
(opt-field (parse-bytes line 153 155)) ; driver_license_state | |
(opt-field (parse-bytes line 155 163)) ; citation_number | |
(opt-field (parse-bytes line 163 171)) ; added_to_acis_date | |
(opt-field (parse-bytes line 171 179)) ; extract_date | |
(opt-field (parse-bytes line 179 189)) ; personal_identifier | |
(opt-field (parse-bytes line 189 190)) ; case_disposed | |
(opt-field (parse-bytes line 190 193)))) ; court_type | |
(define (parse-alias-line bytes) | |
(if (= zero (bytes-ref bytes 0)) | |
(subbytes bytes 1 2) | |
bytes)) | |
(define (parse-case-key line) | |
(bytes-trim (subbytes line 2 18))) | |
(define (parse-name bytes) | |
(define tokens (bytes-split (bytes-delete bytes backslash) comma)) | |
(define len (length tokens)) | |
(values | |
(and (> len 0) (bytes-trim (car tokens))) | |
(and (> len 1) (bytes-trim (cadr tokens))) | |
(and (> len 2) (bytes-trim (caddr tokens))) | |
(and (> len 3) (bytes-trim (cadddr tokens))))) | |
(define (opt-field bytes) | |
(if (and bytes (> (bytes-length bytes) 0)) | |
bytes | |
#"\\N")) | |
(define (parse-bytes line beg end) | |
(bytes-trim (bytes-delete (subbytes line beg end) backslash))) | |
(define (log-error proc-sym line-no str) | |
(printf "*error* Parser:~a line ~a: ~a" (object-name proc-sym) line-no str)) | |
(define (run) | |
(parse-file "cr-4000" | |
"e1-file" | |
"a1-file")) | |
;(profile-thunk run #:use-errortrace? #t) | |
(time (run)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#lang racket | |
(require "./byte-string-support.rkt") | |
;; -------------------------------------------------------------------------------------------- | |
;; Public Interface | |
;; -------------------------------------------------------------------------------------------- | |
;; Convert a byte string to a soundex code. If the input contains no | |
;; alphabetic bytes, return #f | |
(define (soundex original) | |
(define clean (bytes-delete-pred original (λ (b) (or (< b a-code) | |
(> b z-code))))) | |
(if (non-empty-bytes? clean) | |
(soundex-inner clean) | |
#f)) | |
;; -------------------------------------------------------------------------------------------- | |
;; Private | |
;; -------------------------------------------------------------------------------------------- | |
(define a-code (char->integer #\A)) | |
(define e-code (char->integer #\E)) | |
(define h-code (char->integer #\H)) | |
(define i-code (char->integer #\I)) | |
(define o-code (char->integer #\O)) | |
(define u-code (char->integer #\U)) | |
(define w-code (char->integer #\W)) | |
(define y-code (char->integer #\Y)) | |
(define z-code (char->integer #\Z)) | |
(define num-letters (+ (- z-code a-code) 1)) | |
(define soundex-chars #"ABCDEFGHIJKLMNOPQRSTUVWXYZ") | |
(define soundex-codes #"A123E12HI22455O12623U1W2Y2") | |
(define soundex-code-length 4) | |
(define (soundex-inner original) | |
;; 1. Save the first letter. | |
(define first_letter (bytes-car original)) | |
;; 2. Remove all occurrences of 'H' and 'W' EXCEPT first letter. | |
(define str (bytes-append | |
(subbytes original 0 1) | |
(bytes-delete-pred | |
(if (> (bytes-length original) 1) | |
(bytes-cdr original) | |
#"") | |
(λ (b) (or (= b h-code) (= b w-code)))))) | |
;; 3. Replace all consonants (including the first letter) with digits as follows: | |
;; b, f, p, v -> 1 | |
;; c, g, j, k, q, s, x, z -> 2 | |
;; d, t -> 3 | |
;; l -> 4 | |
;; m, n -> 5 | |
;; r -> 6 | |
(set! str (bytes-tr str tr-map)) | |
;; 4. Replace all adjacent same digits with one digit. | |
(set! str (bytes-squeeze str)) | |
;; 5. Remove all occurrences of A, E, I, O, U, Y EXCEPT the first letter. | |
(set! str (let ([len (bytes-length str)]) | |
(bytes-append | |
(subbytes str 0 1) | |
(bytes-delete-pred | |
(if (> len 1) | |
(subbytes str 1 len) | |
#"") | |
(λ (b) (or (= b a-code) | |
(= b e-code) | |
(= b i-code) | |
(= b o-code) | |
(= b u-code) | |
(= b y-code))))))) | |
;; 6. If first symbol is a digit replace it with letter saved on step 1. | |
(when (byte-numeric? (bytes-ref str 0)) | |
(bytes-set! str 0 first_letter)) | |
;; 7. Append zeros or truncate so that the result is a letter followed by 3 digits. | |
(let ([len (bytes-length str)]) | |
(cond [(< len soundex-code-length) (subbytes (bytes-append str #"000") 0 soundex-code-length)] | |
[(> len soundex-code-length) (subbytes str 0 soundex-code-length)] | |
[else str]))) | |
;; Create a translation map to convert letters to soundex digits. | |
(define tr-map | |
(let ([base a-code] | |
[v (make-vector num-letters)]) | |
(for ([c1 (in-bytes soundex-chars)] | |
[c2 (in-bytes soundex-codes)]) | |
(vector-set! v (- c1 base) c2)) | |
v)) | |
;; Translate bytes in a byte string based on a translation map | |
(define (bytes-tr bytes map) | |
(define result (make-bytes (bytes-length bytes))) | |
(for ([c (in-bytes bytes)] | |
[i (in-naturals)]) | |
(let ([k (- c a-code)]) | |
(if (<= 0 k (- num-letters 1)) | |
(bytes-set! result i (vector-ref map k)) | |
(bytes-set! result i c)))) | |
result) | |
(provide soundex) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment