Last active
September 14, 2020 00:58
-
-
Save sogaiu/2c0cdfc2193474679651f6fc8a3a49c0 to your computer and use it in GitHub Desktop.
carocad's parcera instaparse grammar updated
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns parcera.core | |
(:require [instaparse.core :as instaparse] | |
[instaparse.combinators-source :as combi] | |
[instaparse.cfg :as cfg] | |
[parcera.terminals :as terminal]) | |
#?(:cljs (:import goog.string.StringBuffer))) | |
; todo: implement advices from | |
; http://blog.reverberate.org/2013/09/ll-and-lr-in-context-why-parsing-tools.html | |
; https://www.loggly.com/blog/regexes-the-bad-better-best/ | |
; https://www.loggly.com/blog/five-invaluable-techniques-to-improve-regex-performance/ | |
; todo: use advices in https://medium.appbase.io/analyzing-20k-github-repositories-af76de21c3fc | |
; to check if the heuristics are accurate | |
; NOTE: Through my experiments I found out that Instaparse will gladly take the | |
; first match as long as the grammar is not ambiguous. Therefore I switched the | |
; unordered OR (|) with an ordered one (/). This of course implies an heuristic | |
; of knowing which grammar rules are expected to match more often. I use | |
; Clojure's core as a reference with the following code snippet | |
#_(let [core-content (slurp "https://raw.githubusercontent.com/clojure/clojure/master/src/clj/clojure/core.clj")] | |
(time (sort-by second > (frequencies (filter keyword? (flatten (clojure core-content :optimize :memory))))))) | |
#_(let [core-content (slurp "https://raw.githubusercontent.com/clojure/clojurescript/master/src/main/clojure/cljs/core.cljc")] | |
(time (sort-by second > (frequencies (filter keyword? (flatten (clojure core-content :optimize :memory))))))) | |
; todo: performance of [,\s]*;.*|[,\s]+ for whitespace | |
(def grammar-rules | |
"code: input*; | |
<input>: ignore / form; | |
<ignore>: whitespace / comment / discard; | |
<form>: literal / collection / reader-macro; | |
(* for parsing purposes we dont consider a Set a collection since it starts | |
with # -> dispatch macro *) | |
<collection>: list / vector / map; | |
list: <'('> input* <')'>; | |
vector: <'['> input* <']'>; | |
map: <'{'> input* <'}'>; | |
(* a literal is basically anything that is not a collection, macro or whitespace *) | |
<literal>: ( keyword | |
/ macro-keyword | |
/ string | |
/ number | |
/ character | |
/ symbol | |
); | |
(* keyword: ; *) | |
(* macro-keyword: ; *) | |
(* string: ; *) | |
(* number: ; *) | |
(* character: ; *) | |
(* symbol: ; *) | |
<reader-macro>: ( unquote | |
/ metadata | |
/ backtick | |
/ quote | |
/ dispatch | |
/ unquote-splicing | |
/ deref | |
); | |
metadata: ((metadata-entry | deprecated-metadata-entry) ignore*)+ | |
( symbol | |
/ collection | |
/ set | |
/ namespaced-map | |
/ tag | |
/ fn | |
/ unquote | |
/ unquote-splicing | |
/ conditional | |
/ deref | |
/ quote | |
/ backtick | |
/ var-quote | |
); | |
metadata-entry: <'^'> ignore* ( conditional | |
/ map | |
/ symbol | |
/ string | |
/ keyword | |
/ macro-keyword ); | |
deprecated-metadata-entry: <'#^'> ignore* ( conditional | |
/ map | |
/ symbol | |
/ string | |
/ keyword | |
/ macro-keyword ); | |
backtick: <'`'> ignore* form; | |
quote: <'\\''> ignore* form; | |
unquote: <#'~'> ignore* form; | |
unquote-splicing: <'~@'> ignore* form; | |
deref: <'@'> ignore* form; | |
<dispatch>: ( fn | |
/ regex | |
/ set | |
/ conditional | |
/ conditional-splicing | |
/ namespaced-map | |
/ var-quote | |
/ tag | |
/ symbolic | |
/ eval | |
); | |
fn: <'#'> list; | |
(* XXX: how to 'hide' and use regexes using combi? *) | |
regex: <'#'> #'\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*\"' | |
set: <'#{'> input* <'}'>; | |
namespaced-map: <'#'> ( keyword / macro-keyword / auto-resolve ) | |
ignore* | |
map; | |
auto-resolve: <'::'>; | |
var-quote: <'#\\''> ignore* form; | |
discard: <'#_'> ignore* form; | |
tag: <'#'> symbol ignore* (literal / collection / tag); | |
conditional: <'#?'> ignore* list; | |
conditional-splicing: <'#?@'> ignore* list; | |
symbolic: <'##'> ignore* symbol; | |
eval: <'#='> ignore* (symbol / list); | |
(* whitespace: ; *) | |
(* comment: ; *) | |
") | |
(def grammar-terminals | |
{:character (combi/regexp terminal/character-pattern) | |
:string (combi/regexp terminal/string-pattern) | |
:symbol (combi/regexp terminal/symbol-pattern) | |
:number (combi/regexp terminal/number-pattern) | |
:macro-keyword (combi/regexp terminal/macro-keyword-pattern) | |
:keyword (combi/regexp terminal/keyword-pattern) | |
:whitespace (combi/regexp terminal/whitespace-pattern) | |
:comment (combi/regexp terminal/comment-pattern)}) | |
(def grammar (merge (cfg/ebnf grammar-rules) grammar-terminals)) | |
(def clojure | |
"Clojure (instaparse) parser. It can be used as: | |
- (parcera/clojure input-string) | |
-> returns an AST representation of input-string | |
- (instaparse/parse parcera/clojure input-string) | |
-> same as above but more explicit | |
- (instaparse/parses parcera/clojure input-string) | |
-> returns a sequence of possible AST representations in case of ambiguity | |
in input-string | |
For a description of all possible options, visit Instaparse's official | |
documentation: https://github.com/Engelberg/instaparse#reference" | |
(instaparse/parser grammar :start :code)) | |
(defn- code* | |
"internal function used to imperatively build up the code from the provided | |
AST as Clojure's str would be too slow" | |
[ast #?(:clj ^StringBuilder string-builder | |
:cljs ^StringBuffer string-builder)] | |
(case (first ast) | |
:code | |
(doseq [child (rest ast)] | |
(code* child string-builder)) | |
:list | |
(do (. string-builder (append "(")) | |
(doseq [child (rest ast)] (code* child string-builder)) | |
(. string-builder (append ")"))) | |
:vector | |
(do (. string-builder (append "[")) | |
(doseq [child (rest ast)] (code* child string-builder)) | |
(. string-builder (append "]"))) | |
:namespaced-map | |
(do (. string-builder (append "#")) | |
(doseq [child (rest ast)] (code* child string-builder))) | |
:map | |
(do (. string-builder (append "{")) | |
(doseq [child (rest ast)] (code* child string-builder)) | |
(. string-builder (append "}"))) | |
:set | |
(do (. string-builder (append "#{")) | |
(doseq [child (rest ast)] (code* child string-builder)) | |
(. string-builder (append "}"))) | |
(:number :whitespace :comment :symbol :character :string | |
:keyword :macro-keyword) | |
(. string-builder (append (second ast))) | |
:symbolic | |
(do (. string-builder (append "##")) | |
(. string-builder (append (second ast)))) | |
:regex | |
(do (. string-builder (append "#")) | |
(. string-builder (append (second ast)))) | |
:auto-resolve | |
(. string-builder (append "::")) | |
:metadata | |
(do (doseq [child (rest (butlast ast))] (code* child string-builder)) | |
(code* (last ast) string-builder)) | |
:metadata-entry | |
(do (. string-builder (append "^")) | |
(doseq [child (rest ast)] (code* child string-builder))) | |
:deprecated-metadata-entry | |
(do (. string-builder (append "#^")) | |
(doseq [child (rest ast)] (code* child string-builder))) | |
:quote | |
(do (. string-builder (append "'")) | |
(doseq [child (rest ast)] (code* child string-builder))) | |
:var-quote | |
(do (. string-builder (append "#'")) | |
(doseq [child (rest ast)] (code* child string-builder))) | |
:discard | |
(do (. string-builder (append "#_")) | |
(doseq [child (rest ast)] (code* child string-builder))) | |
:tag | |
(do (. string-builder (append "#")) | |
(doseq [child (rest ast)] (code* child string-builder))) | |
:backtick | |
(do (. string-builder (append "`")) | |
(doseq [child (rest ast)] (code* child string-builder))) | |
:unquote | |
(do (. string-builder (append "~")) | |
(doseq [child (rest ast)] (code* child string-builder))) | |
:unquote-splicing | |
(do (. string-builder (append "~@")) | |
(doseq [child (rest ast)] (code* child string-builder))) | |
:conditional | |
(do (. string-builder (append "#?")) | |
(doseq [child (rest ast)] (code* child string-builder))) | |
:conditional-splicing | |
(do (. string-builder (append "#?@")) | |
(doseq [child (rest ast)] (code* child string-builder))) | |
:deref | |
(do (. string-builder (append "@")) | |
(doseq [child (rest ast)] (code* child string-builder))) | |
:fn | |
(do (. string-builder (append "#")) | |
(doseq [child (rest ast)] (code* child string-builder))) | |
:eval | |
(do (. string-builder (append "#=")) | |
(doseq [child (rest ast)] (code* child string-builder))))) | |
(defn code | |
"Transforms your AST back into code | |
ast: The nested sequence of [:keyword & content] which MUST follow the | |
same structure as the result of `(parcera/clojure input-string)` | |
Returns a string representation of the provided AST | |
In general (= input (parcera/code (parcera/clojure input)))" | |
[ast] | |
(let [string-builder #?(:clj (new StringBuilder) | |
:cljs (new StringBuffer))] | |
(code* ast string-builder) | |
(. string-builder (toString)))) | |
; Successful parse. | |
; Profile: {:create-node 384, :push-full-listener 2, :push-stack 384, | |
; :push-listener 382, :push-result 227, :push-message 227 } | |
; "Elapsed time: 47.25084 msecs" | |
#_(time (clojure (str '(ns parcera.core | |
(:require [instaparse.core :as instaparse] | |
[clojure.data :as data] | |
[clojure.string :as str]))) | |
:trace true)) | |
#_(instaparse/disable-tracing!) | |
(comment | |
;; 4400 ms! | |
(let [start (System/currentTimeMillis) | |
src (slurp (str (System/getenv "HOME") | |
"/src/clojure/src/clj/clojure/core.clj")) | |
ast (try | |
(clojure src) | |
(catch Exception e | |
(println e) | |
nil))] | |
(when ast | |
(let [reconstructed (code ast)] | |
(when-not (= reconstructed src) | |
(spit "/tmp/failed-reconstruction.clj" | |
reconstructed) | |
(println "failed to reconstruct")))) | |
(println (- (System/currentTimeMillis) start) "ms")) | |
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns parcera.terminals | |
"Clojure symbols, keywords, numbers and string/regex share quite a lot | |
of matching logic. This namespace is aimed towards clearly identifying | |
those pieces and share them among the different definitions to | |
avoid recurring issues") | |
;; Clojure's reader is quite permissive so we follow the motto | |
;; "if it is not forbidden, it is allowed" | |
(def not-allowed "\\f\\n\\r\\t \\(\\)\\[\\]{}\"@~\\^;`\\\\,:#\\'\\/") | |
(def allowed-character (str "[^" not-allowed "]")) | |
(def keyword-head (str "(" | |
allowed-character "|" "[#']" | |
")")) | |
(def keyword-body (str "(" | |
keyword-head "|" "[:\\/]" | |
")")) | |
(def keyword-pattern (str ":" | |
"(" | |
, "(" keyword-head keyword-body "*" ")" | |
, "|" | |
, "\\/" | |
")")) | |
(def macro-keyword-pattern (str "::" keyword-head keyword-body "*")) | |
(def name-head (str "(" | |
allowed-character "|" "[\\/]" | |
")")) | |
(def name-body (str "(" | |
name-head "|" "[:#\\']" | |
")")) | |
(def symbol-pattern (str name-head name-body "*")) | |
(def double-suffix "(((\\.\\d*)?([eE][-+]?\\d+)?)M?)") | |
(def long-suffix "((0[xX]([\\dA-Fa-f]+)|0([0-7]+)|([1-9]\\d?)[rR]([\\d\\w]+)|0\\d+)?N?)") | |
(def ratio-suffix "(\\/(\\d+))") | |
;; XXX: order ok here? | |
(def number-pattern (str "[+-]?\\d+(" long-suffix "|" double-suffix "|" ratio-suffix ")(?![\\.\\/])")) ; todo: word boundary ? | |
; This is supposed to be the JavaScript friendly version of #'\P{M}\p{M}*+' | |
; mentioned here: https://www.regular-expressions.info/unicode.html | |
; It's cooked by this generator: http://kourge.net/projects/regexp-unicode-block | |
; ticking all 'Combining Diacritical Marks' boxes *)) | |
(def unicode-char "([^\\u0300-\\u036F\\u1DC0-\\u1DFF\\u20D0-\\u20FF])") | |
(def named-char "(newline|return|space|tab|formfeed|backspace)") | |
(def unicode "(u[\\dA-Fa-f]{4})") | |
(def character-pattern (str "\\\\(" unicode-char "|" named-char "|" unicode ")(?!\\w+)")) | |
(def string-pattern "\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*\"") | |
;;(def regex-pattern (str "#" string-pattern)) | |
(def whitespace-pattern "[\f\n\r\t, ]+") | |
(def comment-pattern "(;|(#!))[^\r\n]*") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment