Skip to content

Instantly share code, notes, and snippets.

@lojic
Created January 21, 2016 04:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lojic/283aa3eec777e4810efc to your computer and use it in GitHub Desktop.
Save lojic/283aa3eec777e4810efc to your computer and use it in GitHub Desktop.
#lang racket
(require "./byte-string-support.rkt")
(require "./soundex.rkt")
;(require profile)
(define backslash (char->integer #\\))
(define comma (char->integer #\,))
(define zero (char->integer #\0))
(define (parse-file ipath epath apath)
(call-with-input-file ipath
(λ (i-in)
(call-with-output-file epath
(λ (e-out)
(call-with-output-file apath
(λ (a-out)
(parse-records i-in e-out a-out))
#:exists 'replace))
#:exists 'replace))))
(define (parse-records i-in e-out a-out)
(define num-deletes 0)
(define num-cases 0)
(define num-aliases 0)
;; Extract header
(fprintf e-out
"~a\n"
(string-append
"copy extracts (case_key,county_name,last_name,last_name_soundex,first_name,"
"first_name_soundex,middle_name,middle_name_soundex,suffix,addr_street1,addr_street2,"
"addr_city,addr_state,addr_zip5,addr_zip4,race,sex,birth_year,birth_month,birth_day,"
"last_4_ssn,driver_license,driver_license_state,citation_number,added_to_acis_date,"
"extract_date,personal_identifier,case_disposed,court_type) from stdin;"))
;; SubjectAlias header
(fprintf a-out
"~a\n"
(string-append
"copy subject_aliases (case_key,line_number,last_name,last_name_soundex,first_name,"
"first_name_soundex,middle_name,suffix) from stdin;"))
;; Start output place
(define output-place
(place ch
(define e-out (place-channel-get ch))
(define (helper)
(let ([bytes (place-channel-get ch)])
(when bytes
(write-bytes bytes e-out)
(helper))))
(helper)))
(place-channel-put output-place e-out)
;; Start workers
(define w1
(place ch
(define out-place (place-channel-get ch))
(define (helper)
(place-channel-put out-place (parse-case (place-channel-get ch)))
(helper))
(helper)))
(place-channel-put w1 output-place)
(define w2
(place ch
(define out-place (place-channel-get ch))
(define (helper)
(place-channel-put out-place (parse-case (place-channel-get ch)))
(helper))
(helper)))
(place-channel-put w2 output-place)
(define w3
(place ch
(define out-place (place-channel-get ch))
(define (helper)
(place-channel-put out-place (parse-case (place-channel-get ch)))
(helper))
(helper)))
(place-channel-put w3 output-place)
(define w4
(place ch
(define out-place (place-channel-get ch))
(define (helper)
(place-channel-put out-place (parse-case (place-channel-get ch)))
(helper))
(helper)))
(place-channel-put w4 output-place)
(define workers (vector-immutable w1 w2 w3 w4))
(define num-workers (vector-length workers))
;; Main loop
(for ([line-no (in-naturals 1)]
[line (in-bytes-lines i-in)])
(define record-type (subbytes line 0 2))
(match record-type
[#"00" null ]
[#"01"
(place-channel-put (vector-ref workers (remainder line-no num-workers)) line)
(set! num-cases (+ num-cases 1)) ]
[#"02" (write-bytes (parse-alias line) a-out)
(set! num-aliases (+ num-aliases 1)) ]
[#"99" null ]
[_ (log-error parse-records line-no (format "invalid record type: ~a" record-type)) ]))
(fprintf e-out "\\.\n")
(fprintf a-out "\\.\n"))
(define (parse-alias line)
;; Only use 27 because the '@' character is in the last position!
(define-values (last-name first-name middle-name suffix) (parse-name (subbytes line 20 47)))
(define last-name-soundex (if last-name (soundex last-name) #f))
(define first-name-soundex (if first-name (soundex first-name) #f))
(bytes-append
(opt-field (parse-case-key line))
#"\t"
(opt-field (parse-alias-line (subbytes line 18 20)))
#"\t"
(opt-field last-name)
#"\t"
(opt-field last-name-soundex)
#"\t"
(opt-field first-name)
#"\t"
(opt-field first-name-soundex)
#"\t"
(opt-field middle-name)
#"\t"
(opt-field suffix)
#"\n"))
(define (parse-case line)
(define-values (last-name first-name middle-name suffix) (parse-name (subbytes line 30 58)))
(define last-name-soundex (if last-name (soundex last-name) #f))
(define first-name-soundex (if first-name (soundex first-name) #f))
(bytes-append
(opt-field (parse-case-key line))
#"\t"
(opt-field (parse-bytes line 18 30)) ; county name
#"\t"
(opt-field last-name)
#"\t"
(opt-field last-name-soundex)
#"\t"
(opt-field first-name)
#"\t"
(opt-field first-name-soundex)
#"\t"
(opt-field middle-name)
#"\t"
(opt-field suffix)
#"\t"
(opt-field (parse-bytes line 58 78)) ; addr_street1
#"\t"
(opt-field (parse-bytes line 78 93)) ; addr_street2
#"\t"
(opt-field (parse-bytes line 93 108)) ; city
#"\t"
(opt-field (parse-bytes line 108 110)) ; state
#"\t"
(opt-field (parse-bytes line 110 115)) ; zip5
#"\t"
(opt-field (parse-bytes line 115 119)) ; zip4
#"\t"
(opt-field (parse-bytes line 119 120)) ; race
#"\t"
(opt-field (parse-bytes line 120 121)) ; sex
#"\t"
(opt-field (parse-bytes line 121 125)) ; birth_year
#"\t"
(opt-field (parse-bytes line 125 127)) ; birth_month
#"\t"
(opt-field (parse-bytes line 127 129)) ; birth_day
#"\t"
(opt-field (parse-bytes line 129 133)) ; last_4_ssn
#"\t"
(opt-field (parse-bytes line 133 153)) ; driver_license
#"\t"
(opt-field (parse-bytes line 153 155)) ; driver_license_state
#"\t"
(opt-field (parse-bytes line 155 163)) ; citation_number
#"\t"
(opt-field (parse-bytes line 163 171)) ; added_to_acis_date
#"\t"
(opt-field (parse-bytes line 171 179)) ; extract_date
#"\t"
(opt-field (parse-bytes line 179 189)) ; personal_identifier
#"\t"
(opt-field (parse-bytes line 189 190)) ; case_disposed
#"\t"
(opt-field (parse-bytes line 190 193)) ; court_type
#"\n"))
(define (parse-alias-line bytes)
(if (= zero (bytes-ref bytes 0))
(subbytes bytes 1 2)
bytes))
(define (parse-case-key line)
(bytes-trim (subbytes line 2 18)))
(define (parse-name bytes)
(define tokens (bytes-split (bytes-delete bytes backslash) comma))
(define len (length tokens))
(values
(and (> len 0) (bytes-trim (car tokens)))
(and (> len 1) (bytes-trim (cadr tokens)))
(and (> len 2) (bytes-trim (caddr tokens)))
(and (> len 3) (bytes-trim (cadddr tokens)))))
(define (opt-field bytes)
(if (and bytes (> (bytes-length bytes) 0))
bytes
#"\\N"))
(define (parse-bytes line beg end)
(bytes-trim (bytes-delete (subbytes line beg end) backslash)))
(define (log-error proc-sym line-no str)
(printf "*error* Parser:~a line ~a: ~a" (object-name proc-sym) line-no str))
(define (main)
(time (parse-file "cr-200"
"e1-file"
"a1-file")))
(provide main)
;(profile-thunk run #:use-errortrace? #t)
;(time (run))
#lang racket
(require "./byte-string-support.rkt")
(require "./soundex.rkt")
;(require profile)
(define backslash (char->integer #\\))
(define comma (char->integer #\,))
(define zero (char->integer #\0))
(define (parse-file ipath epath apath)
(call-with-input-file ipath
(λ (i-in)
(call-with-output-file epath
(λ (e-out)
(call-with-output-file apath
(λ (a-out)
(parse-records i-in e-out a-out))
#:exists 'replace))
#:exists 'replace))))
(define (parse-records i-in e-out a-out)
(define num-deletes 0)
(define num-cases 0)
(define num-aliases 0)
;; Extract header
(fprintf e-out
"~a\n"
(string-append
"copy extracts (case_key,county_name,last_name,last_name_soundex,first_name,"
"first_name_soundex,middle_name,middle_name_soundex,suffix,addr_street1,addr_street2,"
"addr_city,addr_state,addr_zip5,addr_zip4,race,sex,birth_year,birth_month,birth_day,"
"last_4_ssn,driver_license,driver_license_state,citation_number,added_to_acis_date,"
"extract_date,personal_identifier,case_disposed,court_type) from stdin;"))
;; SubjectAlias header
(fprintf a-out
"~a\n"
(string-append
"copy subject_aliases (case_key,line_number,last_name,last_name_soundex,first_name,"
"first_name_soundex,middle_name,suffix) from stdin;"))
;; Main loop
(for ([line-no (in-naturals 1)]
[line (in-bytes-lines i-in)])
(define record-type (subbytes line 0 2))
(match record-type
[#"00" null ]
[#"01" (parse-case e-out line-no line)
(set! num-cases (+ num-cases 1)) ]
[#"02" (parse-alias a-out line-no line)
(set! num-aliases (+ num-aliases 1)) ]
[#"99" null ]
[_ (log-error parse-records line-no (format "invalid record type: ~a" record-type)) ]))
(fprintf e-out "\\.\n")
(fprintf a-out "\\.\n"))
(define (parse-alias a-out line-no line)
;; Only use 27 because the '@' character is in the last position!
(define-values (last-name first-name middle-name suffix) (parse-name (subbytes line 20 47)))
(define last-name-soundex (if last-name (soundex last-name) #f))
(define first-name-soundex (if first-name (soundex first-name) #f))
(write-bytes (bytes-append
(opt-field (parse-case-key line))
#"\t"
(opt-field (parse-alias-line (subbytes line 18 20)))
#"\t"
(opt-field last-name)
#"\t"
(opt-field last-name-soundex)
#"\t"
(opt-field first-name)
#"\t"
(opt-field first-name-soundex)
#"\t"
(opt-field middle-name)
#"\t"
(opt-field suffix)
#"\n")
a-out))
(define (parse-case e-out line-no line)
(define-values (last-name first-name middle-name suffix) (parse-name (subbytes line 30 58)))
(define last-name-soundex (if last-name (soundex last-name) #f))
(define first-name-soundex (if first-name (soundex first-name) #f))
(write-bytes (bytes-append
(opt-field (parse-case-key line))
#"\t"
(opt-field (parse-bytes line 18 30)) ; county name
#"\t"
(opt-field last-name)
#"\t"
(opt-field last-name-soundex)
#"\t"
(opt-field first-name)
#"\t"
(opt-field first-name-soundex)
#"\t"
(opt-field middle-name)
#"\t"
(opt-field suffix)
#"\t"
(opt-field (parse-bytes line 58 78)) ; addr_street1
#"\t"
(opt-field (parse-bytes line 78 93)) ; addr_street2
#"\t"
(opt-field (parse-bytes line 93 108)) ; city
#"\t"
(opt-field (parse-bytes line 108 110)) ; state
#"\t"
(opt-field (parse-bytes line 110 115)) ; zip5
#"\t"
(opt-field (parse-bytes line 115 119)) ; zip4
#"\t"
(opt-field (parse-bytes line 119 120)) ; race
#"\t"
(opt-field (parse-bytes line 120 121)) ; sex
#"\t"
(opt-field (parse-bytes line 121 125)) ; birth_year
#"\t"
(opt-field (parse-bytes line 125 127)) ; birth_month
#"\t"
(opt-field (parse-bytes line 127 129)) ; birth_day
#"\t"
(opt-field (parse-bytes line 129 133)) ; last_4_ssn
#"\t"
(opt-field (parse-bytes line 133 153)) ; driver_license
#"\t"
(opt-field (parse-bytes line 153 155)) ; driver_license_state
#"\t"
(opt-field (parse-bytes line 155 163)) ; citation_number
#"\t"
(opt-field (parse-bytes line 163 171)) ; added_to_acis_date
#"\t"
(opt-field (parse-bytes line 171 179)) ; extract_date
#"\t"
(opt-field (parse-bytes line 179 189)) ; personal_identifier
#"\t"
(opt-field (parse-bytes line 189 190)) ; case_disposed
#"\t"
(opt-field (parse-bytes line 190 193)) ; court_type
#"\n")
e-out))
(define (parse-alias-line bytes)
(if (= zero (bytes-ref bytes 0))
(subbytes bytes 1 2)
bytes))
(define (parse-case-key line)
(bytes-trim (subbytes line 2 18)))
(define (parse-name bytes)
(define tokens (bytes-split (bytes-delete bytes backslash) comma))
(define len (length tokens))
(values
(and (> len 0) (bytes-trim (car tokens)))
(and (> len 1) (bytes-trim (cadr tokens)))
(and (> len 2) (bytes-trim (caddr tokens)))
(and (> len 3) (bytes-trim (cadddr tokens)))))
(define (opt-field bytes)
(if (and bytes (> (bytes-length bytes) 0))
bytes
#"\\N"))
(define (parse-bytes line beg end)
(bytes-trim (bytes-delete (subbytes line beg end) backslash)))
(define (log-error proc-sym line-no str)
(printf "*error* Parser:~a line ~a: ~a" (object-name proc-sym) line-no str))
(define (main)
(time (parse-file "cr-200"
"e1-file"
"a1-file")))
(provide main)
;(profile-thunk run #:use-errortrace? #t)
;(time (run))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment