Skip to content

Instantly share code, notes, and snippets.

@edbond
Last active December 31, 2015 22:18
Show Gist options
  • Save edbond/8052305 to your computer and use it in GitHub Desktop.
Save edbond/8052305 to your computer and use it in GitHub Desktop.
SGS instaparse
(ns sgs.core
(:require [instaparse.core :as p]
[clojure.pprint :refer (pprint)]))
;; http://stackoverflow.com/questions/20654883/whats-an-elegant-way-to-parse-this-data-format-in-clojure
;; Instaparse parser for the "SGS" row format shown in `example-input`.
;;
;; Reading the grammar (instaparse notation):
;;   * Angle brackets hide things from the parse tree: on the left-hand side
;;     they hide the rule's tag, on the right-hand side they hide the matched
;;     content.
;;   * `/` is ordered choice (first alternative that matches wins), `|` is
;;     unordered choice, `!x` is negative lookahead.
;;
;; Rules:
;;   SGS          one or more rows; comment-only rows are matched but hidden.
;;   ROW          LABEL, then FIELDS, then an optional (hidden) trailing
;;                comment and newline.
;;   LABEL        the literal 'LAB' followed by digits.
;;   F            a single field value: FFIELD ('F' followed by digits and
;;                upper-case letters), QFIELD (text delimited by two single
;;                quotes on each side) or EMPTY_F (nothing but whitespace).
;;   F_SEP        separator between field values, tried in order: a ';'
;;                followed by a newline (row continuation), a ',' or plain
;;                whitespace. It is always used hidden, so the kind of
;;                separator is not visible in the resulting tree.
;;   FIELDS /     one F followed by any number of separator + F pairs;
;;   NEXT_FIELDS  NEXT_FIELDS has a hidden tag, so the recursion comes out as
;;                a flat list of children under FIELDS.
;;   COMMENT      a '.' followed by the rest of the line.
;;
;; NOTE: `space` is the regex \s*, which also matches the empty string and
;; newlines.
(def sgs
(p/parser
"
<SGS> = (<COMMENT_ROW> | ROW)+
<NL> = '\\n'
<qq> = \"''\"
space = <#'\\s*'>
COMMENT_ROW = COMMENT NL?
LABEL = 'LAB' #'\\d+'
EMPTY_F = <space>
FFIELD = 'F' #'[0-9A-Z]+'
QFIELD = (<qq> (!qq #'.')+ <qq>)
<F> = FFIELD / QFIELD / EMPTY_F
F_SEP = ((space? | ',')* ';' NL space?) / (<space?> ',' <space?>) / <space>
<NEXT_FIELDS> = F <space?> (<F_SEP> NEXT_FIELDS)? <space?>
FIELDS = F <space?> (<F_SEP> NEXT_FIELDS)? <space?>
COMMENT = '.' #'.*'
ROW = LABEL <space?> FIELDS <space?> <COMMENT?> <NL?>
"))
;; Sample SGS document used to exercise the parser. It contains comment-only
;; rows (starting with '.'), a minimal data row with a trailing comment, rows
;; with comma-separated subfields, empty subfields, a quoted field delimited
;; by doubled single quotes, a trailing comma, and a row continued over
;; several lines with ';'.
(def example-input
". Comment
.
LAB1 F1S1 . Minimal data row, with line comment
LAB1 F1S1,F1S2,F1S3 F2S1 F3S1 . 2nd row with same label
LAB2 , , , F1S4 ''Field #2 (only 1 subfield)'' F3S1,,F3S3
LAB99 F1S1, . Field 1 has 2 subfields, 2nd is nil
LAB3 F1S1,F1S2, ;
F1S3 ;
F2S1 . Row continued over 3 lines. ")
(defn sgs-partition
  "Splits a flat sequence of subfield values into a vector of groups
  (vectors), one group per field.

  `fields` is a sequence of strings and nils. A value's field key is the
  leading `F<digits>` prefix of its string form (e.g. \"F10\" for \"F10S2\");
  values without such a prefix (nil, free text) have no key.

  A value is appended to the current (last) group when
    * the current group is empty or holds only nils,
    * the value itself is nil (an empty subfield), or
    * its key equals the key of the current group, where a group's key is
      the key of its first keyed member (nil if it has none).
  Otherwise the value starts a new group.

  Always returns at least one group: an empty `fields` yields [[]].

  The key is anchored at the start of the value and takes all digits, so
  F1 and F10 are distinct fields and text that merely mentions an F-number
  further in is not mistaken for a field of that number."
  [fields]
  (let [field-key (fn [value] (re-find #"^F\d+" (str value)))
        group-key (fn [group] (some field-key group))]
    (reduce
      (fn [groups value]
        ;; `groups` is never empty (seeded with [[]]), so peek/pop are safe
        ;; and, unlike `last`, constant time on vectors.
        (let [current (peek groups)]
          (if (or (every? nil? current)
                  (nil? value)
                  (= (group-key current) (field-key value)))
            ;; extend the current group
            (conj (pop groups) (conj current value))
            ;; start a new group
            (conj groups [value]))))
      [[]]
      fields)))
;; Map from parse-tree tag to the function instaparse's `transform` calls
;; with that node's (already transformed) children as arguments.
(def transform-rules
  {;; 'LAB' and its digits are joined into one label string
   :LABEL   str
   ;; 'F' and the rest of the name are joined into one string
   :FFIELD  str
   ;; the individual characters of a quoted field are joined into one string
   :QFIELD  str
   ;; an empty field has no children and is represented by nil
   :EMPTY_F (fn [] nil)
   ;; all subfield values of a row, regrouped per field
   :FIELDS  (fn [& subfields]
              (->> subfields
                   flatten
                   sgs-partition))
   ;; a row is simply the sequence of its transformed children
   :ROW     (fn [& parts] parts)})
(defn parse-and-transform
  "Parses the string `in` with `parser` (an instaparse parser such as `sgs`),
  applies `transform-rules` to the resulting tree and returns a vector with
  one sequence per row: the label followed by that row's field groups.

  Throws an ex-info when the input cannot be parsed. The ex-data holds the
  instaparse failure under :failure. Without this check the failure object
  would be passed through `p/transform` untouched and then be mapped over as
  if it were a sequence of rows, producing an unrelated error."
  [parser in]
  (let [parsed (parser in)]
    (when (p/failure? parsed)
      (let [failure (p/get-failure parsed)]
        (throw (ex-info (str "Unable to parse SGS input:\n" (pr-str failure))
                        {:failure failure}))))
    (->> parsed
         (p/transform transform-rules)
         ;; each transformed row is (label groups); splice the groups in
         ;; right after the label
         (mapv (fn [[label groups]] (cons label groups))))))
;; sgs.core> (time (dotimes [_ 100] (sgs example-input)))
;; "Elapsed time: 3857.030094 msecs"
;; sgs.core> (pprint (parse-and-transform sgs example-input))
;; [("LAB1" ["F1S1"])
;;  ("LAB1" ["F1S1" "F1S2" "F1S3"] ["F2S1"] ["F3S1"])
;;  ("LAB2"
;;   [nil nil nil "F1S4"]
;;   ["Field #2 (only 1 subfield)"]
;;   ["F3S1" nil "F3S3"])
;;  ("LAB99" ["F1S1" nil])
;;  ("LAB3" ["F1S1" "F1S2" "F1S3"] ["F2S1"])]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment