-
-
Save lojic/283aa3eec777e4810efc to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#lang racket | |
(require "./byte-string-support.rkt") | |
(require "./soundex.rkt") | |
;(require profile) | |
(define backslash (char->integer #\\)) | |
(define comma (char->integer #\,)) | |
(define zero (char->integer #\0)) | |
(define (parse-file ipath epath apath) | |
(call-with-input-file ipath | |
(λ (i-in) | |
(call-with-output-file epath | |
(λ (e-out) | |
(call-with-output-file apath | |
(λ (a-out) | |
(parse-records i-in e-out a-out)) | |
#:exists 'replace)) | |
#:exists 'replace)))) | |
(define (parse-records i-in e-out a-out) | |
(define num-deletes 0) | |
(define num-cases 0) | |
(define num-aliases 0) | |
;; Extract header | |
(fprintf e-out | |
"~a\n" | |
(string-append | |
"copy extracts (case_key,county_name,last_name,last_name_soundex,first_name," | |
"first_name_soundex,middle_name,middle_name_soundex,suffix,addr_street1,addr_street2," | |
"addr_city,addr_state,addr_zip5,addr_zip4,race,sex,birth_year,birth_month,birth_day," | |
"last_4_ssn,driver_license,driver_license_state,citation_number,added_to_acis_date," | |
"extract_date,personal_identifier,case_disposed,court_type) from stdin;")) | |
;; SubjectAlias header | |
(fprintf a-out | |
"~a\n" | |
(string-append | |
"copy subject_aliases (case_key,line_number,last_name,last_name_soundex,first_name," | |
"first_name_soundex,middle_name,suffix) from stdin;")) | |
;; Start output place | |
(define output-place | |
(place ch | |
(define e-out (place-channel-get ch)) | |
(define (helper) | |
(let ([bytes (place-channel-get ch)]) | |
(when bytes | |
(write-bytes bytes e-out) | |
(helper)))) | |
(helper))) | |
(place-channel-put output-place e-out) | |
;; Start workers | |
(define w1 | |
(place ch | |
(define out-place (place-channel-get ch)) | |
(define (helper) | |
(place-channel-put out-place (parse-case (place-channel-get ch))) | |
(helper)) | |
(helper))) | |
(place-channel-put w1 output-place) | |
(define w2 | |
(place ch | |
(define out-place (place-channel-get ch)) | |
(define (helper) | |
(place-channel-put out-place (parse-case (place-channel-get ch))) | |
(helper)) | |
(helper))) | |
(place-channel-put w2 output-place) | |
(define w3 | |
(place ch | |
(define out-place (place-channel-get ch)) | |
(define (helper) | |
(place-channel-put out-place (parse-case (place-channel-get ch))) | |
(helper)) | |
(helper))) | |
(place-channel-put w3 output-place) | |
(define w4 | |
(place ch | |
(define out-place (place-channel-get ch)) | |
(define (helper) | |
(place-channel-put out-place (parse-case (place-channel-get ch))) | |
(helper)) | |
(helper))) | |
(place-channel-put w4 output-place) | |
(define workers (vector-immutable w1 w2 w3 w4)) | |
(define num-workers (vector-length workers)) | |
;; Main loop | |
(for ([line-no (in-naturals 1)] | |
[line (in-bytes-lines i-in)]) | |
(define record-type (subbytes line 0 2)) | |
(match record-type | |
[#"00" null ] | |
[#"01" | |
(place-channel-put (vector-ref workers (remainder line-no num-workers)) line) | |
(set! num-cases (+ num-cases 1)) ] | |
[#"02" (write-bytes (parse-alias line) a-out) | |
(set! num-aliases (+ num-aliases 1)) ] | |
[#"99" null ] | |
[_ (log-error parse-records line-no (format "invalid record type: ~a" record-type)) ])) | |
(fprintf e-out "\\.\n") | |
(fprintf a-out "\\.\n")) | |
(define (parse-alias line) | |
;; Only use 27 because the '@' character is in the last position! | |
(define-values (last-name first-name middle-name suffix) (parse-name (subbytes line 20 47))) | |
(define last-name-soundex (if last-name (soundex last-name) #f)) | |
(define first-name-soundex (if first-name (soundex first-name) #f)) | |
(bytes-append | |
(opt-field (parse-case-key line)) | |
#"\t" | |
(opt-field (parse-alias-line (subbytes line 18 20))) | |
#"\t" | |
(opt-field last-name) | |
#"\t" | |
(opt-field last-name-soundex) | |
#"\t" | |
(opt-field first-name) | |
#"\t" | |
(opt-field first-name-soundex) | |
#"\t" | |
(opt-field middle-name) | |
#"\t" | |
(opt-field suffix) | |
#"\n")) | |
(define (parse-case line) | |
(define-values (last-name first-name middle-name suffix) (parse-name (subbytes line 30 58))) | |
(define last-name-soundex (if last-name (soundex last-name) #f)) | |
(define first-name-soundex (if first-name (soundex first-name) #f)) | |
(bytes-append | |
(opt-field (parse-case-key line)) | |
#"\t" | |
(opt-field (parse-bytes line 18 30)) ; county name | |
#"\t" | |
(opt-field last-name) | |
#"\t" | |
(opt-field last-name-soundex) | |
#"\t" | |
(opt-field first-name) | |
#"\t" | |
(opt-field first-name-soundex) | |
#"\t" | |
(opt-field middle-name) | |
#"\t" | |
(opt-field suffix) | |
#"\t" | |
(opt-field (parse-bytes line 58 78)) ; addr_street1 | |
#"\t" | |
(opt-field (parse-bytes line 78 93)) ; addr_street2 | |
#"\t" | |
(opt-field (parse-bytes line 93 108)) ; city | |
#"\t" | |
(opt-field (parse-bytes line 108 110)) ; state | |
#"\t" | |
(opt-field (parse-bytes line 110 115)) ; zip5 | |
#"\t" | |
(opt-field (parse-bytes line 115 119)) ; zip4 | |
#"\t" | |
(opt-field (parse-bytes line 119 120)) ; race | |
#"\t" | |
(opt-field (parse-bytes line 120 121)) ; sex | |
#"\t" | |
(opt-field (parse-bytes line 121 125)) ; birth_year | |
#"\t" | |
(opt-field (parse-bytes line 125 127)) ; birth_month | |
#"\t" | |
(opt-field (parse-bytes line 127 129)) ; birth_day | |
#"\t" | |
(opt-field (parse-bytes line 129 133)) ; last_4_ssn | |
#"\t" | |
(opt-field (parse-bytes line 133 153)) ; driver_license | |
#"\t" | |
(opt-field (parse-bytes line 153 155)) ; driver_license_state | |
#"\t" | |
(opt-field (parse-bytes line 155 163)) ; citation_number | |
#"\t" | |
(opt-field (parse-bytes line 163 171)) ; added_to_acis_date | |
#"\t" | |
(opt-field (parse-bytes line 171 179)) ; extract_date | |
#"\t" | |
(opt-field (parse-bytes line 179 189)) ; personal_identifier | |
#"\t" | |
(opt-field (parse-bytes line 189 190)) ; case_disposed | |
#"\t" | |
(opt-field (parse-bytes line 190 193)) ; court_type | |
#"\n")) | |
(define (parse-alias-line bytes) | |
(if (= zero (bytes-ref bytes 0)) | |
(subbytes bytes 1 2) | |
bytes)) | |
(define (parse-case-key line) | |
(bytes-trim (subbytes line 2 18))) | |
(define (parse-name bytes) | |
(define tokens (bytes-split (bytes-delete bytes backslash) comma)) | |
(define len (length tokens)) | |
(values | |
(and (> len 0) (bytes-trim (car tokens))) | |
(and (> len 1) (bytes-trim (cadr tokens))) | |
(and (> len 2) (bytes-trim (caddr tokens))) | |
(and (> len 3) (bytes-trim (cadddr tokens))))) | |
(define (opt-field bytes) | |
(if (and bytes (> (bytes-length bytes) 0)) | |
bytes | |
#"\\N")) | |
(define (parse-bytes line beg end) | |
(bytes-trim (bytes-delete (subbytes line beg end) backslash))) | |
(define (log-error proc-sym line-no str) | |
(printf "*error* Parser:~a line ~a: ~a" (object-name proc-sym) line-no str)) | |
(define (main) | |
(time (parse-file "cr-200" | |
"e1-file" | |
"a1-file"))) | |
(provide main) | |
;(profile-thunk run #:use-errortrace? #t) | |
;(time (run)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#lang racket | |
(require "./byte-string-support.rkt") | |
(require "./soundex.rkt") | |
;(require profile) | |
(define backslash (char->integer #\\)) | |
(define comma (char->integer #\,)) | |
(define zero (char->integer #\0)) | |
(define (parse-file ipath epath apath) | |
(call-with-input-file ipath | |
(λ (i-in) | |
(call-with-output-file epath | |
(λ (e-out) | |
(call-with-output-file apath | |
(λ (a-out) | |
(parse-records i-in e-out a-out)) | |
#:exists 'replace)) | |
#:exists 'replace)))) | |
(define (parse-records i-in e-out a-out) | |
(define num-deletes 0) | |
(define num-cases 0) | |
(define num-aliases 0) | |
;; Extract header | |
(fprintf e-out | |
"~a\n" | |
(string-append | |
"copy extracts (case_key,county_name,last_name,last_name_soundex,first_name," | |
"first_name_soundex,middle_name,middle_name_soundex,suffix,addr_street1,addr_street2," | |
"addr_city,addr_state,addr_zip5,addr_zip4,race,sex,birth_year,birth_month,birth_day," | |
"last_4_ssn,driver_license,driver_license_state,citation_number,added_to_acis_date," | |
"extract_date,personal_identifier,case_disposed,court_type) from stdin;")) | |
;; SubjectAlias header | |
(fprintf a-out | |
"~a\n" | |
(string-append | |
"copy subject_aliases (case_key,line_number,last_name,last_name_soundex,first_name," | |
"first_name_soundex,middle_name,suffix) from stdin;")) | |
;; Main loop | |
(for ([line-no (in-naturals 1)] | |
[line (in-bytes-lines i-in)]) | |
(define record-type (subbytes line 0 2)) | |
(match record-type | |
[#"00" null ] | |
[#"01" (parse-case e-out line-no line) | |
(set! num-cases (+ num-cases 1)) ] | |
[#"02" (parse-alias a-out line-no line) | |
(set! num-aliases (+ num-aliases 1)) ] | |
[#"99" null ] | |
[_ (log-error parse-records line-no (format "invalid record type: ~a" record-type)) ])) | |
(fprintf e-out "\\.\n") | |
(fprintf a-out "\\.\n")) | |
(define (parse-alias a-out line-no line) | |
;; Only use 27 because the '@' character is in the last position! | |
(define-values (last-name first-name middle-name suffix) (parse-name (subbytes line 20 47))) | |
(define last-name-soundex (if last-name (soundex last-name) #f)) | |
(define first-name-soundex (if first-name (soundex first-name) #f)) | |
(write-bytes (bytes-append | |
(opt-field (parse-case-key line)) | |
#"\t" | |
(opt-field (parse-alias-line (subbytes line 18 20))) | |
#"\t" | |
(opt-field last-name) | |
#"\t" | |
(opt-field last-name-soundex) | |
#"\t" | |
(opt-field first-name) | |
#"\t" | |
(opt-field first-name-soundex) | |
#"\t" | |
(opt-field middle-name) | |
#"\t" | |
(opt-field suffix) | |
#"\n") | |
a-out)) | |
(define (parse-case e-out line-no line) | |
(define-values (last-name first-name middle-name suffix) (parse-name (subbytes line 30 58))) | |
(define last-name-soundex (if last-name (soundex last-name) #f)) | |
(define first-name-soundex (if first-name (soundex first-name) #f)) | |
(write-bytes (bytes-append | |
(opt-field (parse-case-key line)) | |
#"\t" | |
(opt-field (parse-bytes line 18 30)) ; county name | |
#"\t" | |
(opt-field last-name) | |
#"\t" | |
(opt-field last-name-soundex) | |
#"\t" | |
(opt-field first-name) | |
#"\t" | |
(opt-field first-name-soundex) | |
#"\t" | |
(opt-field middle-name) | |
#"\t" | |
(opt-field suffix) | |
#"\t" | |
(opt-field (parse-bytes line 58 78)) ; addr_street1 | |
#"\t" | |
(opt-field (parse-bytes line 78 93)) ; addr_street2 | |
#"\t" | |
(opt-field (parse-bytes line 93 108)) ; city | |
#"\t" | |
(opt-field (parse-bytes line 108 110)) ; state | |
#"\t" | |
(opt-field (parse-bytes line 110 115)) ; zip5 | |
#"\t" | |
(opt-field (parse-bytes line 115 119)) ; zip4 | |
#"\t" | |
(opt-field (parse-bytes line 119 120)) ; race | |
#"\t" | |
(opt-field (parse-bytes line 120 121)) ; sex | |
#"\t" | |
(opt-field (parse-bytes line 121 125)) ; birth_year | |
#"\t" | |
(opt-field (parse-bytes line 125 127)) ; birth_month | |
#"\t" | |
(opt-field (parse-bytes line 127 129)) ; birth_day | |
#"\t" | |
(opt-field (parse-bytes line 129 133)) ; last_4_ssn | |
#"\t" | |
(opt-field (parse-bytes line 133 153)) ; driver_license | |
#"\t" | |
(opt-field (parse-bytes line 153 155)) ; driver_license_state | |
#"\t" | |
(opt-field (parse-bytes line 155 163)) ; citation_number | |
#"\t" | |
(opt-field (parse-bytes line 163 171)) ; added_to_acis_date | |
#"\t" | |
(opt-field (parse-bytes line 171 179)) ; extract_date | |
#"\t" | |
(opt-field (parse-bytes line 179 189)) ; personal_identifier | |
#"\t" | |
(opt-field (parse-bytes line 189 190)) ; case_disposed | |
#"\t" | |
(opt-field (parse-bytes line 190 193)) ; court_type | |
#"\n") | |
e-out)) | |
(define (parse-alias-line bytes) | |
(if (= zero (bytes-ref bytes 0)) | |
(subbytes bytes 1 2) | |
bytes)) | |
(define (parse-case-key line) | |
(bytes-trim (subbytes line 2 18))) | |
(define (parse-name bytes) | |
(define tokens (bytes-split (bytes-delete bytes backslash) comma)) | |
(define len (length tokens)) | |
(values | |
(and (> len 0) (bytes-trim (car tokens))) | |
(and (> len 1) (bytes-trim (cadr tokens))) | |
(and (> len 2) (bytes-trim (caddr tokens))) | |
(and (> len 3) (bytes-trim (cadddr tokens))))) | |
(define (opt-field bytes) | |
(if (and bytes (> (bytes-length bytes) 0)) | |
bytes | |
#"\\N")) | |
(define (parse-bytes line beg end) | |
(bytes-trim (bytes-delete (subbytes line beg end) backslash))) | |
(define (log-error proc-sym line-no str) | |
(printf "*error* Parser:~a line ~a: ~a" (object-name proc-sym) line-no str)) | |
(define (main) | |
(time (parse-file "cr-200" | |
"e1-file" | |
"a1-file"))) | |
(provide main) | |
;(profile-thunk run #:use-errortrace? #t) | |
;(time (run)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment