Skip to content

Instantly share code, notes, and snippets.

@lojic

lojic/parser.rkt Secret

Last active November 20, 2015 04:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lojic/1deba97f2e2eb2fe3fc0 to your computer and use it in GitHub Desktop.
Save lojic/1deba97f2e2eb2fe3fc0 to your computer and use it in GitHub Desktop.
#lang racket
(require "./phonetic.rkt")
(require "./string-helper.rkt")
(require profile)
(define eof-str (string #\u001A))
(define (log-error proc-sym line-no str)
(printf "*error* Parser:~a line ~a: ~a" (object-name proc-sym) line-no str))
(define (run)
(parse-file "pg_bench/cr-100"
"pg_bench/e-file"
"pg_bench/a-file"))
(define (parse-file ipath epath apath)
(call-with-input-file ipath
(λ (i-in)
(call-with-output-file epath
(λ (e-out)
(call-with-output-file apath
(λ (a-out)
(parse-records i-in e-out a-out))))))))
(define (parse-records i-in e-out a-out)
(define line-no 1)
(define num-deletes 0)
(define num-cases 0)
(define num-aliases 0)
(fprintf e-out "~a\n" #<<EOF
copy extracts
(case_key,county_name,last_name,last_name_soundex,first_name,first_name_soundex,
middle_name,middle_name_soundex,suffix,addr_street1,addr_street2,addr_city,
addr_state,addr_zip5,addr_zip4,race,sex,birth_year,birth_month,birth_day,
last_4_ssn,driver_license,driver_license_state,citation_number,added_to_acis_date,
extract_date,personal_identifier,case_disposed,court_type) from stdin;
EOF
)
(fprintf a-out "~a\n" #<<EOF
copy subject_aliases
(case_key,line_number,last_name,last_name_soundex,first_name,first_name_soundex,
middle_name,suffix) from stdin;
EOF
)
(for ([line (in-lines i-in)])
(define record-type (substring line 0 2))
(match record-type
["00" null ] ;(parse-delete line-no line)
;(set! num-deletes (+ num-deletes 1)) ]
["01" (parse-case e-out line-no line)
(set! num-cases (+ num-cases 1)) ]
["02" (parse-alias a-out line-no line)
(set! num-aliases (+ num-aliases 1)) ]
["99" (parse-eof line-no line) ]
[eof-str (log-error parse-records line-no "premature end of file") ]
[_ (log-error parse-records line-no (format "invalid record type: ~a" record-type)) ])
(set! line-no (+ line-no 1))))
(define (parse-alias a-out line-no line)
(define-values (last-name first-name middle-name suffix) (parse-name (substring line 20 47)))
(define last-name-soundex (if last-name (soundex last-name) #f))
(define first-name-soundex (if first-name (soundex first-name) #f))
(fprintf a-out "~a\t~a\t~a\t~a\t~a\t~a\t~a\n" (parse-case-key line)
(opt-field last-name)
(opt-field last-name-soundex)
(opt-field first-name)
(opt-field first-name-soundex)
(opt-field middle-name)
(opt-field suffix)))
(define (parse-integer str)
(string->number str))
(define (parse-case e-out line-no line)
(define-values (last-name first-name middle-name suffix) (parse-name (substring line 20 47)))
(define last-name-soundex (if last-name (soundex last-name) #f))
(define first-name-soundex (if first-name (soundex first-name) #f))
(fprintf e-out "~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\n"
(parse-case-key line)
(opt-field (substring line 18 30)) ; county name
(opt-field last-name)
(opt-field last-name-soundex)
(opt-field first-name)
(opt-field first-name-soundex)
(opt-field middle-name)
(opt-field suffix)
(opt-field (substring line 58 78)) ; addr_street1
(opt-field (substring line 78 93)) ; addr_street2
(opt-field (substring line 93 108)) ; city
(opt-field (substring line 108 110)) ; state
(opt-field (substring line 110 115)) ; zip5
(opt-field (substring line 115 119)) ; zip4
(opt-field (substring line 119 120)) ; race
(opt-field (substring line 120 121))
(opt-field (parse-integer (substring line 121 125))) ; birth_year
(opt-field (parse-integer (substring line 125 127))) ; birth_month
(opt-field (parse-integer (substring line 127 129))) ; birth_day
(opt-field (substring line 129 133)) ; last_4_ssn
(opt-field (substring line 133 153)) ; driver_license
(opt-field (substring line 153 155)) ; driver_license_state
(opt-field (substring line 155 163)) ; citation_number
(opt-field (substring line 163 171)) ; added_to_acis_date
(opt-field (substring line 171 179)) ; extract_date
(opt-field (substring line 179 189)) ; personal_identifier
(opt-field (substring line 189 190)) ; case_disposed
(opt-field (substring line 190 193)))) ; court_type
(define (parse-delete line-no line) "")
(define (parse-eof line-no line) "")
(define (parse-case-key line)
(string-trim (substring line 2 18)))
(define (opt-field val)
(if val val "\\N"))
(define (parse-name str)
(match (string-split (string-trim (string-replace str "\\" "")) ",")
[ (list last first middle suffix) (values last first middle suffix) ]
[ (list last first middle) (values last first middle #f) ]
[ (list last first) (values last first #f #f) ]
[ (list last) (values last #f #f #f) ]
[ _ (values #f #f #f #f) ] ))
(define (pipe . fs)
(apply compose (reverse fs)))
;(profile-thunk run #:use-errortrace? #t)
(time (run))
#lang racket
(require "./string-helper.rkt")
(define soundex-chars "AEIOUYBFPVCGJKQSXZDTLMNR")
(define soundex-non-code (pregexp (format "[^~a]" soundex-chars)))
(define soundex-non-alpha (pregexp "[^A-Z]"))
(define a-code (char->integer #\A))
(define z-code (char->integer #\Z))
(define tr-map
(let ([base (char->integer #\A)]
[v (make-vector 26)])
(for ([c1 (in-string soundex-chars)]
[c2 (in-string "000000111122222222334556")])
(vector-set! v (- (char->integer c1) base) c2))
v))
(define (non-upcase-alpha str)
(define len (string-length str))
(define result (make-string len))
(define i -1)
(for ([c (in-string str)])
(let ([code (char->integer c)])
(when (and (>= code a-code) (<= code z-code))
(set! i (+ i 1))
(string-set! result i c))))
(substring result 0 (+ i 1)))
(define (remove-zeros str)
(define len (string-length str))
(define result (make-string len))
(define i -1)
(for ([c (in-string str)])
(when (not (eqv? c #\0))
(set! i (+ i 1))
(string-set! result i c)))
(substring result 0 (+ i 1)))
(define (soundex original)
(define upstr (string-upcase original))
(define str (non-upcase-alpha upstr))
(if (string-empty? str)
#f
(let ([first (~r (- (char->integer (string-ref str 0)) 55)
#:min-width 2 #:pad-string "0") ]
[rest (string-ljust
(remove-zeros (string-squeeze (string-tr (string-replace (string-cdr str)
soundex-non-code
"")
tr-map)))
3 #\0)])
(string-append first rest))))
(provide soundex)
#lang racket
(define (string-cdr str)
(substring str 1 (string-length str)))
(define (string-empty? str)
(< (string-length str) 1))
(define (string-ljust str min pad)
(let ([len (string-length str)])
(if (> len 2)
str
(string-append str (make-string (- 3 len) pad)))))
; Returns a new string where runs of the same
; character are replaced by a single character.
; (string-squeeze "abbaacdccee") -> "abacdce"
(define (string-squeeze str)
(define result (make-string (string-length str)))
(define last null)
(define i -1)
(for ([c (in-string str)])
(when (not (eqv? c last))
(set! i (+ i 1))
(set! last c)
(string-set! result i c)))
(substring result 0 (+ i 1)))
(define a-code (char->integer #\A))
(define (string-tr str map)
(define result (make-string (string-length str)))
(for ([c (in-string str)]
[i (in-naturals)])
(string-set! result i (vector-ref map (- (char->integer c) a-code))))
result)
(define (bench-it f n)
(time
(for ([i (in-range n)])
(f "aaabbcdddeefgggghhhhhhhhiiijjiiijjijwwwwwww"))))
(provide string-cdr string-empty? string-ljust string-squeeze string-tr)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment