-
-
Save lojic/1deba97f2e2eb2fe3fc0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#lang racket | |
(require "./phonetic.rkt") | |
(require "./string-helper.rkt") | |
(require profile) | |
(define eof-str (string #\u001A)) | |
(define (log-error proc-sym line-no str) | |
(printf "*error* Parser:~a line ~a: ~a" (object-name proc-sym) line-no str)) | |
(define (run) | |
(parse-file "pg_bench/cr-100" | |
"pg_bench/e-file" | |
"pg_bench/a-file")) | |
(define (parse-file ipath epath apath) | |
(call-with-input-file ipath | |
(λ (i-in) | |
(call-with-output-file epath | |
(λ (e-out) | |
(call-with-output-file apath | |
(λ (a-out) | |
(parse-records i-in e-out a-out)))))))) | |
(define (parse-records i-in e-out a-out) | |
(define line-no 1) | |
(define num-deletes 0) | |
(define num-cases 0) | |
(define num-aliases 0) | |
(fprintf e-out "~a\n" #<<EOF | |
copy extracts | |
(case_key,county_name,last_name,last_name_soundex,first_name,first_name_soundex, | |
middle_name,middle_name_soundex,suffix,addr_street1,addr_street2,addr_city, | |
addr_state,addr_zip5,addr_zip4,race,sex,birth_year,birth_month,birth_day, | |
last_4_ssn,driver_license,driver_license_state,citation_number,added_to_acis_date, | |
extract_date,personal_identifier,case_disposed,court_type) from stdin; | |
EOF | |
) | |
(fprintf a-out "~a\n" #<<EOF | |
copy subject_aliases | |
(case_key,line_number,last_name,last_name_soundex,first_name,first_name_soundex, | |
middle_name,suffix) from stdin; | |
EOF | |
) | |
(for ([line (in-lines i-in)]) | |
(define record-type (substring line 0 2)) | |
(match record-type | |
["00" null ] ;(parse-delete line-no line) | |
;(set! num-deletes (+ num-deletes 1)) ] | |
["01" (parse-case e-out line-no line) | |
(set! num-cases (+ num-cases 1)) ] | |
["02" (parse-alias a-out line-no line) | |
(set! num-aliases (+ num-aliases 1)) ] | |
["99" (parse-eof line-no line) ] | |
[eof-str (log-error parse-records line-no "premature end of file") ] | |
[_ (log-error parse-records line-no (format "invalid record type: ~a" record-type)) ]) | |
(set! line-no (+ line-no 1)))) | |
(define (parse-alias a-out line-no line) | |
(define-values (last-name first-name middle-name suffix) (parse-name (substring line 20 47))) | |
(define last-name-soundex (if last-name (soundex last-name) #f)) | |
(define first-name-soundex (if first-name (soundex first-name) #f)) | |
(fprintf a-out "~a\t~a\t~a\t~a\t~a\t~a\t~a\n" (parse-case-key line) | |
(opt-field last-name) | |
(opt-field last-name-soundex) | |
(opt-field first-name) | |
(opt-field first-name-soundex) | |
(opt-field middle-name) | |
(opt-field suffix))) | |
(define (parse-integer str) | |
(string->number str)) | |
(define (parse-case e-out line-no line) | |
(define-values (last-name first-name middle-name suffix) (parse-name (substring line 20 47))) | |
(define last-name-soundex (if last-name (soundex last-name) #f)) | |
(define first-name-soundex (if first-name (soundex first-name) #f)) | |
(fprintf e-out "~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\t~a\n" | |
(parse-case-key line) | |
(opt-field (substring line 18 30)) ; county name | |
(opt-field last-name) | |
(opt-field last-name-soundex) | |
(opt-field first-name) | |
(opt-field first-name-soundex) | |
(opt-field middle-name) | |
(opt-field suffix) | |
(opt-field (substring line 58 78)) ; addr_street1 | |
(opt-field (substring line 78 93)) ; addr_street2 | |
(opt-field (substring line 93 108)) ; city | |
(opt-field (substring line 108 110)) ; state | |
(opt-field (substring line 110 115)) ; zip5 | |
(opt-field (substring line 115 119)) ; zip4 | |
(opt-field (substring line 119 120)) ; race | |
(opt-field (substring line 120 121)) | |
(opt-field (parse-integer (substring line 121 125))) ; birth_year | |
(opt-field (parse-integer (substring line 125 127))) ; birth_month | |
(opt-field (parse-integer (substring line 127 129))) ; birth_day | |
(opt-field (substring line 129 133)) ; last_4_ssn | |
(opt-field (substring line 133 153)) ; driver_license | |
(opt-field (substring line 153 155)) ; driver_license_state | |
(opt-field (substring line 155 163)) ; citation_number | |
(opt-field (substring line 163 171)) ; added_to_acis_date | |
(opt-field (substring line 171 179)) ; extract_date | |
(opt-field (substring line 179 189)) ; personal_identifier | |
(opt-field (substring line 189 190)) ; case_disposed | |
(opt-field (substring line 190 193)))) ; court_type | |
(define (parse-delete line-no line) "") | |
(define (parse-eof line-no line) "") | |
(define (parse-case-key line) | |
(string-trim (substring line 2 18))) | |
(define (opt-field val) | |
(if val val "\\N")) | |
(define (parse-name str) | |
(match (string-split (string-trim (string-replace str "\\" "")) ",") | |
[ (list last first middle suffix) (values last first middle suffix) ] | |
[ (list last first middle) (values last first middle #f) ] | |
[ (list last first) (values last first #f #f) ] | |
[ (list last) (values last #f #f #f) ] | |
[ _ (values #f #f #f #f) ] )) | |
(define (pipe . fs) | |
(apply compose (reverse fs))) | |
;(profile-thunk run #:use-errortrace? #t) | |
(time (run)) | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#lang racket | |
(require "./string-helper.rkt") | |
(define soundex-chars "AEIOUYBFPVCGJKQSXZDTLMNR") | |
(define soundex-non-code (pregexp (format "[^~a]" soundex-chars))) | |
(define soundex-non-alpha (pregexp "[^A-Z]")) | |
(define a-code (char->integer #\A)) | |
(define z-code (char->integer #\Z)) | |
(define tr-map | |
(let ([base (char->integer #\A)] | |
[v (make-vector 26)]) | |
(for ([c1 (in-string soundex-chars)] | |
[c2 (in-string "000000111122222222334556")]) | |
(vector-set! v (- (char->integer c1) base) c2)) | |
v)) | |
(define (non-upcase-alpha str) | |
(define len (string-length str)) | |
(define result (make-string len)) | |
(define i -1) | |
(for ([c (in-string str)]) | |
(let ([code (char->integer c)]) | |
(when (and (>= code a-code) (<= code z-code)) | |
(set! i (+ i 1)) | |
(string-set! result i c)))) | |
(substring result 0 (+ i 1))) | |
(define (remove-zeros str) | |
(define len (string-length str)) | |
(define result (make-string len)) | |
(define i -1) | |
(for ([c (in-string str)]) | |
(when (not (eqv? c #\0)) | |
(set! i (+ i 1)) | |
(string-set! result i c))) | |
(substring result 0 (+ i 1))) | |
(define (soundex original) | |
(define upstr (string-upcase original)) | |
(define str (non-upcase-alpha upstr)) | |
(if (string-empty? str) | |
#f | |
(let ([first (~r (- (char->integer (string-ref str 0)) 55) | |
#:min-width 2 #:pad-string "0") ] | |
[rest (string-ljust | |
(remove-zeros (string-squeeze (string-tr (string-replace (string-cdr str) | |
soundex-non-code | |
"") | |
tr-map))) | |
3 #\0)]) | |
(string-append first rest)))) | |
(provide soundex) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#lang racket | |
(define (string-cdr str) | |
(substring str 1 (string-length str))) | |
(define (string-empty? str) | |
(< (string-length str) 1)) | |
(define (string-ljust str min pad) | |
(let ([len (string-length str)]) | |
(if (> len 2) | |
str | |
(string-append str (make-string (- 3 len) pad))))) | |
; Returns a new string where runs of the same | |
; character are replaced by a single character. | |
; (string-squeeze "abbaacdccee") -> "abacdce" | |
(define (string-squeeze str) | |
(define result (make-string (string-length str))) | |
(define last null) | |
(define i -1) | |
(for ([c (in-string str)]) | |
(when (not (eqv? c last)) | |
(set! i (+ i 1)) | |
(set! last c) | |
(string-set! result i c))) | |
(substring result 0 (+ i 1))) | |
(define a-code (char->integer #\A)) | |
(define (string-tr str map) | |
(define result (make-string (string-length str))) | |
(for ([c (in-string str)] | |
[i (in-naturals)]) | |
(string-set! result i (vector-ref map (- (char->integer c) a-code)))) | |
result) | |
(define (bench-it f n) | |
(time | |
(for ([i (in-range n)]) | |
(f "aaabbcdddeefgggghhhhhhhhiiijjiiijjijwwwwwww")))) | |
(provide string-cdr string-empty? string-ljust string-squeeze string-tr) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment