Skip to content

Instantly share code, notes, and snippets.

@lojic

lojic/parser.rkt Secret

Created January 17, 2016 02:46
Show Gist options
  • Save lojic/f306104846d516761952 to your computer and use it in GitHub Desktop.
Save lojic/f306104846d516761952 to your computer and use it in GitHub Desktop.
#lang racket
;(require profile)
(define (parse-file ipath epath apath)
(call-with-input-file ipath
(λ (i-in)
(call-with-output-file epath
(λ (e-out)
(call-with-output-file apath
(λ (a-out)
(parse-records i-in e-out a-out))
#:exists 'replace))
#:exists 'replace))))
(define (parse-records i-in e-out a-out)
(define line-no 1)
(define num-deletes 0)
(define num-cases 0)
(define num-aliases 0)
;; Extract header
(fprintf e-out
"~a\n"
"copy extracts (case_key,county_name,last_name,last_name_soundex,first_name,first_name_soundex,middle_name,middle_name_soundex,suffix,addr_street1,addr_street2,addr_city,addr_state,addr_zip5,addr_zip4,race,sex,birth_year,birth_month,birth_day,last_4_ssn,driver_license,driver_license_state,citation_number,added_to_acis_date,extract_date,personal_identifier,case_disposed,court_type) from stdin;")
;; SubjectAlias header
(fprintf a-out
"~a\n"
"copy subject_aliases (case_key,line_number,last_name,last_name_soundex,first_name,first_name_soundex,middle_name,suffix) from stdin;")
(for ([line (in-lines i-in)])
(define record-type (substring line 0 2))
(match record-type
["00" null ]
["01" (parse-case e-out line-no line)
(set! num-cases (+ num-cases 1)) ]
["02" (parse-alias a-out line-no line)
(set! num-aliases (+ num-aliases 1)) ]
["99" null ]
[eof-str (log-error parse-records line-no "premature end of file") ]
[_ (log-error parse-records line-no (format "invalid record type: ~a" record-type)) ])
(set! line-no (+ line-no 1)))
(fprintf e-out "\\.\n")
(fprintf a-out "\\.\n"))
(define (parse-alias a-out line-no line)
;; Only use 27 because the '@' character is in the last position!
(define-values (last-name first-name middle-name suffix) (parse-name (substring line 20 47)))
(define last-name-soundex (if last-name (soundex last-name) #f))
(define first-name-soundex (if first-name (soundex first-name) #f))
(fprintf a-out "~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\n"
(opt-field (parse-case-key line))
(string->number (substring line 18 20))
(opt-field last-name)
(opt-field last-name-soundex)
(opt-field first-name)
(opt-field first-name-soundex)
(opt-field middle-name)
(opt-field suffix)))
(define (parse-case e-out line-no line)
(define-values (last-name first-name middle-name suffix) (parse-name (substring line 30 58)))
(define last-name-soundex (if last-name (soundex last-name) #f))
(define first-name-soundex (if first-name (soundex first-name) #f))
(fprintf e-out "~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\n"
(opt-field (parse-case-key line))
(opt-field (parse-string line 18 30)) ; county name
(opt-field last-name)
(opt-field last-name-soundex)
(opt-field first-name)
(opt-field first-name-soundex)
(opt-field middle-name)
(opt-field suffix)
(opt-field (parse-string line 58 78)) ; addr_street1
(opt-field (parse-string line 78 93)) ; addr_street2
(opt-field (parse-string line 93 108)) ; city
(opt-field (parse-string line 108 110)) ; state
(opt-field (parse-string line 110 115)) ; zip5
(opt-field (parse-string line 115 119)) ; zip4
(opt-field (parse-string line 119 120)) ; race
(opt-field (parse-string line 120 121)) ; sex
(opt-field (parse-string line 121 125)) ; birth_year
(opt-field (parse-string line 125 127)) ; birth_month
(opt-field (parse-string line 127 129)) ; birth_day
(opt-field (parse-string line 129 133)) ; last_4_ssn
(opt-field (parse-string line 133 153)) ; driver_license
(opt-field (parse-string line 153 155)) ; driver_license_state
(opt-field (parse-string line 155 163)) ; citation_number
(opt-field (parse-string line 163 171)) ; added_to_acis_date
(opt-field (parse-string line 171 179)) ; extract_date
(opt-field (parse-string line 179 189)) ; personal_identifier
(opt-field (parse-string line 189 190)) ; case_disposed
(opt-field (parse-string line 190 193)))) ; court_type
(define (parse-case-key line)
(string-trim (substring line 2 18)))
(define (parse-name str)
(define last-name #f)
(define first-name #f)
(define middle-name #f)
(define suffix #f)
(define tokens (string-split (string-replace str "\\" "") ","))
(define len (length tokens))
(when (> len 0)
(set! last-name (string-trim (car tokens))))
(when (> len 1)
(set! first-name (string-trim (cadr tokens))))
(when (> len 2)
(set! middle-name (string-trim (caddr tokens))))
(when (> len 3)
(set! suffix (string-trim (cadddr tokens))))
(values last-name first-name middle-name suffix))
(define (opt-field str)
(if (non-empty-string? str)
str
"\\N"))
(define (parse-string line beg end)
(string-trim (string-replace (substring line beg end) "\\" "")))
(define (log-error proc-sym line-no str)
(printf "*error* Parser:~a line ~a: ~a" (object-name proc-sym) line-no str))
;; --------------------------------------------------------------------------------------------
;; Support for Soundex
;; --------------------------------------------------------------------------------------------
; Returns a new string where runs of the same
; character are replaced by a single character.
; (string-squeeze "abbaacdccee") -> "abacdce"
(define (string-squeeze str)
(define result (make-string (string-length str)))
(define last null)
(define i -1)
(for ([c (in-string str)])
(when (not (eqv? c last))
(set! i (+ i 1))
(set! last c)
(string-set! result i c)))
(substring result 0 (+ i 1)))
(define a-code (char->integer #\A))
(define (string-tr str map)
(define result (make-string (string-length str)))
(for ([c (in-string str)]
[i (in-naturals)])
(let ([k (- (char->integer c) a-code)])
(if (<= 0 k 25)
(string-set! result i (vector-ref map k))
(string-set! result i c))))
result)
;; --------------------------------------------------------------------------------------------
;; Soundex
;; --------------------------------------------------------------------------------------------
(define soundex-chars "ABCDEFGHIJKLMNOPQRSTUVWXYZ")
(define soundex-codes "A123E12HI22455O12623U1W2Y2")
(define tr-map
(let ([base (char->integer #\A)]
[v (make-vector 26)])
(for ([c1 (in-string soundex-chars)]
[c2 (in-string soundex-codes)])
(vector-set! v (- (char->integer c1) base) c2))
v))
(define (soundex original)
;; Convert to upper case. Also provides a copy to avoid modifying original
(define result (make-string (string-length original)))
(define dst -1)
(for ([c (in-string original)])
(when (or (char-alphabetic? c))
(set! dst (+ dst 1))
(string-set! result dst (char-upcase c))))
(define str1 (substring result 0 (+ dst 1)))
(if (non-empty-string? str1)
(soundex2 str1)
""))
(define (soundex2 str1)
;; 1. Save the first letter.
(define first_letter (string-ref str1 0))
;; 2. Remove all occurrences of 'h' and 'w' EXCEPT first letter.
(define len (string-length str1))
(define str2 (string-append
(substring str1 0 1)
(string-replace (if (> len 1)
(substring str1 1 len)
"")
#rx"[HW]" "")))
;; 3. Replace all consonants (including the first letter) with digits as follows:
;; b, f, p, v -> 1
;; c, g, j, k, q, s, x, z -> 2
;; d, t -> 3
;; l -> 4
;; m, n -> 5
;; r -> 6
(define str3 (string-tr str2 tr-map))
;; 4. Replace all adjacent same digits with one digit.
(define str4 (string-squeeze str3))
;; 5. Remove all occurrences of a, e, i, o, u, y EXCEPT first letter.
(define len2 (string-length str4))
(define str5(string-append
(substring str4 0 1)
(string-replace (if (> len2 1)
(substring str4 1 len2)
"")
#rx"[AEIOUY]" "")))
;; 6. If first symbol is a digit replace it with letter saved on step 1.
(when (char-numeric? (string-ref str5 0))
(string-set! str5 0 first_letter))
;; 7. Append zeros or truncate so that the result is a letter followed by 3 digits.
(define len3 (string-length str5))
(cond [(< len3 4) (substring (string-append str5 "000") 0 4)]
[(> len3 4) (substring str5 0 4)]
[else str5]))
(define eof-str (string #\u001A))
(define (run)
(parse-file "cr-200"
"e1-file"
"a1-file"))
;(profile-thunk run #:use-errortrace? #t)
(time (run))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment