Last active
December 31, 2015 22:18
-
-
Save edbond/8052305 to your computer and use it in GitHub Desktop.
SGS instaparse
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns sgs.core | |
(:require [instaparse.core :as p] | |
[clojure.pprint :refer (pprint)])) | |
;; http://stackoverflow.com/questions/20654883/whats-an-elegant-way-to-parse-this-data-format-in-clojure | |
;; Instaparse grammar for the "SGS" row format (see the Stack Overflow link above).
;;
;; Rule overview, as written in the grammar string below:
;;   SGS          one or more COMMENT_ROW or ROW entries (top-level rule).
;;   ROW          LABEL, then FIELDS, then an optional trailing COMMENT and newline.
;;   COMMENT_ROW  a COMMENT optionally followed by a newline.
;;   LABEL        the literal 'LAB' followed by digits.
;;   FFIELD       the literal 'F' followed by one or more of [0-9A-Z].
;;   QFIELD       characters enclosed between two doubled single quotes ('' ... '').
;;   EMPTY_F      a field consisting only of (possibly empty) whitespace.
;;   F            ordered choice: FFIELD, then QFIELD, then EMPTY_F.
;;   F_SEP        field separator: a ';' + newline continuation, a comma, or whitespace.
;;   FIELDS / NEXT_FIELDS
;;                a field followed by optional (separator + further fields), recursively.
;;   COMMENT      a '.' followed by the regex .*
;;
;; In instaparse, angle brackets around a rule name or a sub-expression hide
;; that node or match from the resulting parse tree.
;;
;; NOTE(review): `space` is defined as \s* (which can match the empty string)
;; and is additionally used as `space?` in several rules; this overlap may be
;; related to the slow timing recorded in the comment at the end of the file
;; -- not verified.
(def sgs | |
(p/parser | |
" | |
<SGS> = (<COMMENT_ROW> | ROW)+ | |
<NL> = '\\n' | |
<qq> = \"''\" | |
space = <#'\\s*'> | |
COMMENT_ROW = COMMENT NL? | |
LABEL = 'LAB' #'\\d+' | |
EMPTY_F = <space> | |
FFIELD = 'F' #'[0-9A-Z]+' | |
QFIELD = (<qq> (!qq #'.')+ <qq>) | |
<F> = FFIELD / QFIELD / EMPTY_F | |
F_SEP = ((space? | ',')* ';' NL space?) / (<space?> ',' <space?>) / <space> | |
<NEXT_FIELDS> = F <space?> (<F_SEP> NEXT_FIELDS)? <space?> | |
FIELDS = F <space?> (<F_SEP> NEXT_FIELDS)? <space?> | |
COMMENT = '.' #'.*' | |
ROW = LABEL <space?> FIELDS <space?> <COMMENT?> <NL?> | |
")) | |
;; Sample SGS text used to exercise the parser. It contains:
;;   - comment rows that start with '.',
;;   - data rows that start with a LAB<n> label, some with a trailing '.' comment,
;;   - comma-separated subfields, including empty ones,
;;   - a quoted field delimited by doubled single quotes,
;;   - a row continued across several lines with ';'.
(def example-input | |
". Comment | |
. | |
LAB1 F1S1 . Minimal data row, with line comment | |
LAB1 F1S1,F1S2,F1S3 F2S1 F3S1 . 2nd row with same label | |
LAB2 , , , F1S4 ''Field #2 (only 1 subfield)'' F3S1,,F3S3 | |
LAB99 F1S1, . Field 1 has 2 subfields, 2nd is nil | |
LAB3 F1S1,F1S2, ; | |
F1S3 ; | |
F2S1 . Row continued over 3 lines. ") | |
(defn sgs-partition
  "Splits a flat sequence of field values into a vector of groups (vectors),
  one group per field number.

  `fields` is a sequential collection of strings and nils. A value's field id
  is the leading 'F' plus all digits that follow it (so \"F1S2\" -> \"F1\" and
  \"F10S1\" -> \"F10\"); values that do not start that way (nil, quoted text)
  have no id.

  A value is appended to the current group when any of these holds:
    - the current group is empty or contains only nils,
    - the value itself is nil (an empty subfield),
    - the value's id equals the id of the first identifiable value in the
      current group (two id-less values also compare equal).
  Otherwise a new group is started.

  Always returns a non-empty vector; for empty input the result is [[]].

  NOTE(review): quoted text that itself begins with 'F' and a digit is still
  indistinguishable from a plain field here, because only strings are passed in."
  [fields]
  (let [;; Anchored, multi-digit match: avoids treating F10 as F1 and avoids
        ;; picking up an 'F<digit>' that merely occurs inside quoted text.
        field-id (fn [v] (re-find #"^F\d+" (str v)))
        group-id (fn [group] (some field-id group))]
    (reduce (fn [groups f]
              (let [current (peek groups)]
                (if (or (every? nil? current)
                        (nil? f)
                        (= (group-id current) (field-id f)))
                  ;; extend the last group
                  (conj (pop groups) (conj current f))
                  ;; start a new group
                  (conj groups [f]))))
            [[]]
            fields)))
;; Node-tag -> function map handed to instaparse's transform.
;; Each function receives the (already transformed) children of a node
;; carrying that tag:
;;   :LABEL / :FFIELD / :QFIELD  concatenate their pieces into one string
;;   :EMPTY_F                    takes no children and yields nil
;;   :FIELDS                     flattens its children and groups them by field
;;   :ROW                        returns its children as a sequence
(def transform-rules
  {:LABEL   str
   :FFIELD  str
   :QFIELD  str
   :EMPTY_F (fn [] nil)
   :FIELDS  (fn [& parts] (-> parts flatten sgs-partition))
   :ROW     (fn [& children] children)})
(defn parse-and-transform
  "Parses the string `in` with `parser` and converts the parse tree into a
  vector of rows, each row being a sequence of the label followed by its
  field groups, e.g. (\"LAB1\" [\"F1S1\" \"F1S2\"] [\"F2S1\"]).

  If parsing fails, the instaparse failure object is returned unchanged so the
  caller can inspect or print it (check with instaparse.core/failure?), rather
  than attempting to post-process it as if it were a sequence of rows."
  [parser in]
  (let [tree (parser in)]
    (if (p/failure? tree)
      tree
      (->> tree
           (p/transform transform-rules)
           ;; splat the fields vector so each row is (label group1 group2 ...)
           (mapv (fn [[lbl fields]] (concat [lbl] fields)))))))
;; sgs.core> (time (dotimes [_ 100] (sgs example-input))) | |
;; "Elapsed time: 3857.030094 msecs" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
sgs.core> (pprint (parse-and-transform sgs example-input)) | |
[("LAB1" ["F1S1"]) | |
("LAB1" ["F1S1" "F1S2" "F1S3"] ["F2S1"] ["F3S1"]) | |
("LAB2" | |
[nil nil nil "F1S4"] | |
["Field #2 (only 1 subfield)"] | |
["F3S1" nil "F3S3"]) | |
("LAB99" ["F1S1" nil]) | |
("LAB3" ["F1S1" "F1S2" "F1S3"] ["F2S1"])] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment