-
-
Save lojic/f306104846d516761952 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#lang racket | |
;(require profile) | |
(define (parse-file ipath epath apath) | |
(call-with-input-file ipath | |
(λ (i-in) | |
(call-with-output-file epath | |
(λ (e-out) | |
(call-with-output-file apath | |
(λ (a-out) | |
(parse-records i-in e-out a-out)) | |
#:exists 'replace)) | |
#:exists 'replace)))) | |
(define (parse-records i-in e-out a-out) | |
(define line-no 1) | |
(define num-deletes 0) | |
(define num-cases 0) | |
(define num-aliases 0) | |
;; Extract header | |
(fprintf e-out | |
"~a\n" | |
"copy extracts (case_key,county_name,last_name,last_name_soundex,first_name,first_name_soundex,middle_name,middle_name_soundex,suffix,addr_street1,addr_street2,addr_city,addr_state,addr_zip5,addr_zip4,race,sex,birth_year,birth_month,birth_day,last_4_ssn,driver_license,driver_license_state,citation_number,added_to_acis_date,extract_date,personal_identifier,case_disposed,court_type) from stdin;") | |
;; SubjectAlias header | |
(fprintf a-out | |
"~a\n" | |
"copy subject_aliases (case_key,line_number,last_name,last_name_soundex,first_name,first_name_soundex,middle_name,suffix) from stdin;") | |
(for ([line (in-lines i-in)]) | |
(define record-type (substring line 0 2)) | |
(match record-type | |
["00" null ] | |
["01" (parse-case e-out line-no line) | |
(set! num-cases (+ num-cases 1)) ] | |
["02" (parse-alias a-out line-no line) | |
(set! num-aliases (+ num-aliases 1)) ] | |
["99" null ] | |
[eof-str (log-error parse-records line-no "premature end of file") ] | |
[_ (log-error parse-records line-no (format "invalid record type: ~a" record-type)) ]) | |
(set! line-no (+ line-no 1))) | |
(fprintf e-out "\\.\n") | |
(fprintf a-out "\\.\n")) | |
(define (parse-alias a-out line-no line) | |
;; Only use 27 because the '@' character is in the last position! | |
(define-values (last-name first-name middle-name suffix) (parse-name (substring line 20 47))) | |
(define last-name-soundex (if last-name (soundex last-name) #f)) | |
(define first-name-soundex (if first-name (soundex first-name) #f)) | |
(fprintf a-out "~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\n" | |
(opt-field (parse-case-key line)) | |
(string->number (substring line 18 20)) | |
(opt-field last-name) | |
(opt-field last-name-soundex) | |
(opt-field first-name) | |
(opt-field first-name-soundex) | |
(opt-field middle-name) | |
(opt-field suffix))) | |
(define (parse-case e-out line-no line) | |
(define-values (last-name first-name middle-name suffix) (parse-name (substring line 30 58))) | |
(define last-name-soundex (if last-name (soundex last-name) #f)) | |
(define first-name-soundex (if first-name (soundex first-name) #f)) | |
(fprintf e-out "~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\n" | |
(opt-field (parse-case-key line)) | |
(opt-field (parse-string line 18 30)) ; county name | |
(opt-field last-name) | |
(opt-field last-name-soundex) | |
(opt-field first-name) | |
(opt-field first-name-soundex) | |
(opt-field middle-name) | |
(opt-field suffix) | |
(opt-field (parse-string line 58 78)) ; addr_street1 | |
(opt-field (parse-string line 78 93)) ; addr_street2 | |
(opt-field (parse-string line 93 108)) ; city | |
(opt-field (parse-string line 108 110)) ; state | |
(opt-field (parse-string line 110 115)) ; zip5 | |
(opt-field (parse-string line 115 119)) ; zip4 | |
(opt-field (parse-string line 119 120)) ; race | |
(opt-field (parse-string line 120 121)) ; sex | |
(opt-field (parse-string line 121 125)) ; birth_year | |
(opt-field (parse-string line 125 127)) ; birth_month | |
(opt-field (parse-string line 127 129)) ; birth_day | |
(opt-field (parse-string line 129 133)) ; last_4_ssn | |
(opt-field (parse-string line 133 153)) ; driver_license | |
(opt-field (parse-string line 153 155)) ; driver_license_state | |
(opt-field (parse-string line 155 163)) ; citation_number | |
(opt-field (parse-string line 163 171)) ; added_to_acis_date | |
(opt-field (parse-string line 171 179)) ; extract_date | |
(opt-field (parse-string line 179 189)) ; personal_identifier | |
(opt-field (parse-string line 189 190)) ; case_disposed | |
(opt-field (parse-string line 190 193)))) ; court_type | |
(define (parse-case-key line) | |
(string-trim (substring line 2 18))) | |
(define (parse-name str) | |
(define last-name #f) | |
(define first-name #f) | |
(define middle-name #f) | |
(define suffix #f) | |
(define tokens (string-split (string-replace str "\\" "") ",")) | |
(define len (length tokens)) | |
(when (> len 0) | |
(set! last-name (string-trim (car tokens)))) | |
(when (> len 1) | |
(set! first-name (string-trim (cadr tokens)))) | |
(when (> len 2) | |
(set! middle-name (string-trim (caddr tokens)))) | |
(when (> len 3) | |
(set! suffix (string-trim (cadddr tokens)))) | |
(values last-name first-name middle-name suffix)) | |
(define (opt-field str) | |
(if (non-empty-string? str) | |
str | |
"\\N")) | |
(define (parse-string line beg end) | |
(string-trim (string-replace (substring line beg end) "\\" ""))) | |
(define (log-error proc-sym line-no str) | |
(printf "*error* Parser:~a line ~a: ~a" (object-name proc-sym) line-no str)) | |
;; -------------------------------------------------------------------------------------------- | |
;; Support for Soundex | |
;; -------------------------------------------------------------------------------------------- | |
; Returns a new string where runs of the same | |
; character are replaced by a single character. | |
; (string-squeeze "abbaacdccee") -> "abacdce" | |
(define (string-squeeze str) | |
(define result (make-string (string-length str))) | |
(define last null) | |
(define i -1) | |
(for ([c (in-string str)]) | |
(when (not (eqv? c last)) | |
(set! i (+ i 1)) | |
(set! last c) | |
(string-set! result i c))) | |
(substring result 0 (+ i 1))) | |
(define a-code (char->integer #\A)) | |
(define (string-tr str map) | |
(define result (make-string (string-length str))) | |
(for ([c (in-string str)] | |
[i (in-naturals)]) | |
(let ([k (- (char->integer c) a-code)]) | |
(if (<= 0 k 25) | |
(string-set! result i (vector-ref map k)) | |
(string-set! result i c)))) | |
result) | |
;; -------------------------------------------------------------------------------------------- | |
;; Soundex | |
;; -------------------------------------------------------------------------------------------- | |
(define soundex-chars "ABCDEFGHIJKLMNOPQRSTUVWXYZ") | |
(define soundex-codes "A123E12HI22455O12623U1W2Y2") | |
(define tr-map | |
(let ([base (char->integer #\A)] | |
[v (make-vector 26)]) | |
(for ([c1 (in-string soundex-chars)] | |
[c2 (in-string soundex-codes)]) | |
(vector-set! v (- (char->integer c1) base) c2)) | |
v)) | |
(define (soundex original) | |
;; Convert to upper case. Also provides a copy to avoid modifying original | |
(define result (make-string (string-length original))) | |
(define dst -1) | |
(for ([c (in-string original)]) | |
(when (or (char-alphabetic? c)) | |
(set! dst (+ dst 1)) | |
(string-set! result dst (char-upcase c)))) | |
(define str1 (substring result 0 (+ dst 1))) | |
(if (non-empty-string? str1) | |
(soundex2 str1) | |
"")) | |
(define (soundex2 str1) | |
;; 1. Save the first letter. | |
(define first_letter (string-ref str1 0)) | |
;; 2. Remove all occurrences of 'h' and 'w' EXCEPT first letter. | |
(define len (string-length str1)) | |
(define str2 (string-append | |
(substring str1 0 1) | |
(string-replace (if (> len 1) | |
(substring str1 1 len) | |
"") | |
#rx"[HW]" ""))) | |
;; 3. Replace all consonants (including the first letter) with digits as follows: | |
;; b, f, p, v -> 1 | |
;; c, g, j, k, q, s, x, z -> 2 | |
;; d, t -> 3 | |
;; l -> 4 | |
;; m, n -> 5 | |
;; r -> 6 | |
(define str3 (string-tr str2 tr-map)) | |
;; 4. Replace all adjacent same digits with one digit. | |
(define str4 (string-squeeze str3)) | |
;; 5. Remove all occurrences of a, e, i, o, u, y EXCEPT first letter. | |
(define len2 (string-length str4)) | |
(define str5(string-append | |
(substring str4 0 1) | |
(string-replace (if (> len2 1) | |
(substring str4 1 len2) | |
"") | |
#rx"[AEIOUY]" ""))) | |
;; 6. If first symbol is a digit replace it with letter saved on step 1. | |
(when (char-numeric? (string-ref str5 0)) | |
(string-set! str5 0 first_letter)) | |
;; 7. Append zeros or truncate so that the result is a letter followed by 3 digits. | |
(define len3 (string-length str5)) | |
(cond [(< len3 4) (substring (string-append str5 "000") 0 4)] | |
[(> len3 4) (substring str5 0 4)] | |
[else str5])) | |
(define eof-str (string #\u001A)) | |
(define (run) | |
(parse-file "cr-200" | |
"e1-file" | |
"a1-file")) | |
;(profile-thunk run #:use-errortrace? #t) | |
(time (run)) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment