Skip to content

Instantly share code, notes, and snippets.

@sogaiu
Last active September 14, 2020 00:58
Show Gist options
  • Save sogaiu/2c0cdfc2193474679651f6fc8a3a49c0 to your computer and use it in GitHub Desktop.
Save sogaiu/2c0cdfc2193474679651f6fc8a3a49c0 to your computer and use it in GitHub Desktop.
carocad's parcera instaparse grammar updated
(ns parcera.core
(:require [instaparse.core :as instaparse]
[instaparse.combinators-source :as combi]
[instaparse.cfg :as cfg]
[parcera.terminals :as terminal])
#?(:cljs (:import goog.string.StringBuffer)))
; todo: implement advices from
; http://blog.reverberate.org/2013/09/ll-and-lr-in-context-why-parsing-tools.html
; https://www.loggly.com/blog/regexes-the-bad-better-best/
; https://www.loggly.com/blog/five-invaluable-techniques-to-improve-regex-performance/
; todo: use advices in https://medium.appbase.io/analyzing-20k-github-repositories-af76de21c3fc
; to check if the heuristics are accurate
; NOTE: Through my experiments I found out that Instaparse will gladly take the
; first match as long as the grammar is not ambiguous. Therefore I switched the
; unordered OR (|) with an ordered one (/). This of course implies an heuristic
; of knowing which grammar rules are expected to match more often. I use
; Clojure's core as a reference with the following code snippet
#_(let [core-content (slurp "https://raw.githubusercontent.com/clojure/clojure/master/src/clj/clojure/core.clj")]
(time (sort-by second > (frequencies (filter keyword? (flatten (clojure core-content :optimize :memory)))))))
#_(let [core-content (slurp "https://raw.githubusercontent.com/clojure/clojurescript/master/src/main/clojure/cljs/core.cljc")]
(time (sort-by second > (frequencies (filter keyword? (flatten (clojure core-content :optimize :memory)))))))
; todo: performance of [,\s]*;.*|[,\s]+ for whitespace
(def grammar-rules
"code: input*;
<input>: ignore / form;
<ignore>: whitespace / comment / discard;
<form>: literal / collection / reader-macro;
(* for parsing purposes we dont consider a Set a collection since it starts
with # -> dispatch macro *)
<collection>: list / vector / map;
list: <'('> input* <')'>;
vector: <'['> input* <']'>;
map: <'{'> input* <'}'>;
(* a literal is basically anything that is not a collection, macro or whitespace *)
<literal>: ( keyword
/ macro-keyword
/ string
/ number
/ character
/ symbol
);
(* keyword: ; *)
(* macro-keyword: ; *)
(* string: ; *)
(* number: ; *)
(* character: ; *)
(* symbol: ; *)
<reader-macro>: ( unquote
/ metadata
/ backtick
/ quote
/ dispatch
/ unquote-splicing
/ deref
);
metadata: ((metadata-entry | deprecated-metadata-entry) ignore*)+
( symbol
/ collection
/ set
/ namespaced-map
/ tag
/ fn
/ unquote
/ unquote-splicing
/ conditional
/ deref
/ quote
/ backtick
/ var-quote
);
metadata-entry: <'^'> ignore* ( conditional
/ map
/ symbol
/ string
/ keyword
/ macro-keyword );
deprecated-metadata-entry: <'#^'> ignore* ( conditional
/ map
/ symbol
/ string
/ keyword
/ macro-keyword );
backtick: <'`'> ignore* form;
quote: <'\\''> ignore* form;
unquote: <#'~'> ignore* form;
unquote-splicing: <'~@'> ignore* form;
deref: <'@'> ignore* form;
<dispatch>: ( fn
/ regex
/ set
/ conditional
/ conditional-splicing
/ namespaced-map
/ var-quote
/ tag
/ symbolic
/ eval
);
fn: <'#'> list;
(* XXX: how to 'hide' and use regexes using combi? *)
regex: <'#'> #'\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*\"'
set: <'#{'> input* <'}'>;
namespaced-map: <'#'> ( keyword / macro-keyword / auto-resolve )
ignore*
map;
auto-resolve: <'::'>;
var-quote: <'#\\''> ignore* form;
discard: <'#_'> ignore* form;
tag: <'#'> symbol ignore* (literal / collection / tag);
conditional: <'#?'> ignore* list;
conditional-splicing: <'#?@'> ignore* list;
symbolic: <'##'> ignore* symbol;
eval: <'#='> ignore* (symbol / list);
(* whitespace: ; *)
(* comment: ; *)
")
(def grammar-terminals
{:character (combi/regexp terminal/character-pattern)
:string (combi/regexp terminal/string-pattern)
:symbol (combi/regexp terminal/symbol-pattern)
:number (combi/regexp terminal/number-pattern)
:macro-keyword (combi/regexp terminal/macro-keyword-pattern)
:keyword (combi/regexp terminal/keyword-pattern)
:whitespace (combi/regexp terminal/whitespace-pattern)
:comment (combi/regexp terminal/comment-pattern)})
(def grammar (merge (cfg/ebnf grammar-rules) grammar-terminals))
(def clojure
"Clojure (instaparse) parser. It can be used as:
- (parcera/clojure input-string)
-> returns an AST representation of input-string
- (instaparse/parse parcera/clojure input-string)
-> same as above but more explicit
- (instaparse/parses parcera/clojure input-string)
-> returns a sequence of possible AST representations in case of ambiguity
in input-string
For a description of all possible options, visit Instaparse's official
documentation: https://github.com/Engelberg/instaparse#reference"
(instaparse/parser grammar :start :code))
(defn- code*
"internal function used to imperatively build up the code from the provided
AST as Clojure's str would be too slow"
[ast #?(:clj ^StringBuilder string-builder
:cljs ^StringBuffer string-builder)]
(case (first ast)
:code
(doseq [child (rest ast)]
(code* child string-builder))
:list
(do (. string-builder (append "("))
(doseq [child (rest ast)] (code* child string-builder))
(. string-builder (append ")")))
:vector
(do (. string-builder (append "["))
(doseq [child (rest ast)] (code* child string-builder))
(. string-builder (append "]")))
:namespaced-map
(do (. string-builder (append "#"))
(doseq [child (rest ast)] (code* child string-builder)))
:map
(do (. string-builder (append "{"))
(doseq [child (rest ast)] (code* child string-builder))
(. string-builder (append "}")))
:set
(do (. string-builder (append "#{"))
(doseq [child (rest ast)] (code* child string-builder))
(. string-builder (append "}")))
(:number :whitespace :comment :symbol :character :string
:keyword :macro-keyword)
(. string-builder (append (second ast)))
:symbolic
(do (. string-builder (append "##"))
(. string-builder (append (second ast))))
:regex
(do (. string-builder (append "#"))
(. string-builder (append (second ast))))
:auto-resolve
(. string-builder (append "::"))
:metadata
(do (doseq [child (rest (butlast ast))] (code* child string-builder))
(code* (last ast) string-builder))
:metadata-entry
(do (. string-builder (append "^"))
(doseq [child (rest ast)] (code* child string-builder)))
:deprecated-metadata-entry
(do (. string-builder (append "#^"))
(doseq [child (rest ast)] (code* child string-builder)))
:quote
(do (. string-builder (append "'"))
(doseq [child (rest ast)] (code* child string-builder)))
:var-quote
(do (. string-builder (append "#'"))
(doseq [child (rest ast)] (code* child string-builder)))
:discard
(do (. string-builder (append "#_"))
(doseq [child (rest ast)] (code* child string-builder)))
:tag
(do (. string-builder (append "#"))
(doseq [child (rest ast)] (code* child string-builder)))
:backtick
(do (. string-builder (append "`"))
(doseq [child (rest ast)] (code* child string-builder)))
:unquote
(do (. string-builder (append "~"))
(doseq [child (rest ast)] (code* child string-builder)))
:unquote-splicing
(do (. string-builder (append "~@"))
(doseq [child (rest ast)] (code* child string-builder)))
:conditional
(do (. string-builder (append "#?"))
(doseq [child (rest ast)] (code* child string-builder)))
:conditional-splicing
(do (. string-builder (append "#?@"))
(doseq [child (rest ast)] (code* child string-builder)))
:deref
(do (. string-builder (append "@"))
(doseq [child (rest ast)] (code* child string-builder)))
:fn
(do (. string-builder (append "#"))
(doseq [child (rest ast)] (code* child string-builder)))
:eval
(do (. string-builder (append "#="))
(doseq [child (rest ast)] (code* child string-builder)))))
(defn code
"Transforms your AST back into code
ast: The nested sequence of [:keyword & content] which MUST follow the
same structure as the result of `(parcera/clojure input-string)`
Returns a string representation of the provided AST
In general (= input (parcera/code (parcera/clojure input)))"
[ast]
(let [string-builder #?(:clj (new StringBuilder)
:cljs (new StringBuffer))]
(code* ast string-builder)
(. string-builder (toString))))
; Successful parse.
; Profile: {:create-node 384, :push-full-listener 2, :push-stack 384,
; :push-listener 382, :push-result 227, :push-message 227 }
; "Elapsed time: 47.25084 msecs"
#_(time (clojure (str '(ns parcera.core
(:require [instaparse.core :as instaparse]
[clojure.data :as data]
[clojure.string :as str])))
:trace true))
#_(instaparse/disable-tracing!)
(comment
;; 4400 ms!
(let [start (System/currentTimeMillis)
src (slurp (str (System/getenv "HOME")
"/src/clojure/src/clj/clojure/core.clj"))
ast (try
(clojure src)
(catch Exception e
(println e)
nil))]
(when ast
(let [reconstructed (code ast)]
(when-not (= reconstructed src)
(spit "/tmp/failed-reconstruction.clj"
reconstructed)
(println "failed to reconstruct"))))
(println (- (System/currentTimeMillis) start) "ms"))
)
(ns parcera.terminals
"Clojure symbols, keywords, numbers and string/regex share quite a lot
of matching logic. This namespace is aimed towards clearly identifying
those pieces and share them among the different definitions to
avoid recurring issues")
;; Clojure's reader is quite permissive so we follow the motto
;; "if it is not forbidden, it is allowed"
(def not-allowed "\\f\\n\\r\\t \\(\\)\\[\\]{}\"@~\\^;`\\\\,:#\\'\\/")
(def allowed-character (str "[^" not-allowed "]"))
(def keyword-head (str "("
allowed-character "|" "[#']"
")"))
(def keyword-body (str "("
keyword-head "|" "[:\\/]"
")"))
(def keyword-pattern (str ":"
"("
, "(" keyword-head keyword-body "*" ")"
, "|"
, "\\/"
")"))
(def macro-keyword-pattern (str "::" keyword-head keyword-body "*"))
(def name-head (str "("
allowed-character "|" "[\\/]"
")"))
(def name-body (str "("
name-head "|" "[:#\\']"
")"))
(def symbol-pattern (str name-head name-body "*"))
(def double-suffix "(((\\.\\d*)?([eE][-+]?\\d+)?)M?)")
(def long-suffix "((0[xX]([\\dA-Fa-f]+)|0([0-7]+)|([1-9]\\d?)[rR]([\\d\\w]+)|0\\d+)?N?)")
(def ratio-suffix "(\\/(\\d+))")
;; XXX: order ok here?
(def number-pattern (str "[+-]?\\d+(" long-suffix "|" double-suffix "|" ratio-suffix ")(?![\\.\\/])")) ; todo: word boundary ?
; This is supposed to be the JavaScript friendly version of #'\P{M}\p{M}*+'
; mentioned here: https://www.regular-expressions.info/unicode.html
; It's cooked by this generator: http://kourge.net/projects/regexp-unicode-block
; ticking all 'Combining Diacritical Marks' boxes *))
(def unicode-char "([^\\u0300-\\u036F\\u1DC0-\\u1DFF\\u20D0-\\u20FF])")
(def named-char "(newline|return|space|tab|formfeed|backspace)")
(def unicode "(u[\\dA-Fa-f]{4})")
(def character-pattern (str "\\\\(" unicode-char "|" named-char "|" unicode ")(?!\\w+)"))
(def string-pattern "\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*\"")
;;(def regex-pattern (str "#" string-pattern))
(def whitespace-pattern "[\f\n\r\t, ]+")
(def comment-pattern "(;|(#!))[^\r\n]*")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment