bpsm/edn.wsn

## edn.wsn
(* Syntax of Extensible Data Notation -- http://github.com/edn-format/edn

See https://github.com/edn-format/edn/issues/56

This grammar is written in a slightly extended version of Wirth Syntax
Notation. A description is appended to the end of this document. *)


(* start *)

elements = { s element }.

element = nil
        | boolean
        | symbol
        | keyword
        | number
        | character
        | string
        | "{" s { element s element s } "}"
        | "#{" elements s "}"
        | "[" elements s "]"
        | "(" elements s ")"
        | "#" tagSymbol s element.
    (* Separating white space between elements is permitted, but it is not
    always needed. {}{} reads as two empty maps, and "MeaningOfLife"42 reads
    as a String followed by an integer. By contrast, a:b is one single
    symbol; it is not the symbol 'a' followed by the keyword ':b'. *)


(* white space *)

s = { whitespace | comment | discardedElement }.
    (* s marks each position where white space may occur. *)

whitespace = " " | "," | HT | LF | CR.

comment = ";" { commentContent } LF.
    (* Only \newline (i.e. LF) terminates a comment, as stated in
    https://github.com/edn-format/edn/issues/31. *)

commentContent = U+1 | … | U+9 | U+B | … | MaxCodePoint.
    (* every character other than NUL and LF. *)

discardedElement = "#_" s element.
    (* Consequently, in "#_ #_ 2 1" the second #_ discards 2 and the
    first #_ discards 1, so as far as edn is concerned the whole example
    might as well be white space. *)

HT  = U+9.
LF  = U+A.
CR  = U+D.

MaxCodePoint = U+10FFFF.


(* symbols *)

symbol = "/"
       | name
       | name "/" name.
    (* Either the lone "/" or a name, optionally preceded by a "name/"
    prefix. *)

tagSymbol = letter { nameConstituent } [ "/" name ].
    (* https://github.com/edn-format/edn/issues/30#issuecomment-8540641
    "tag symbols must begin with an alphabetic character"

    A tag symbol is a symbol whose first character is a letter. The form
    letter [ name "/" ] name demanded a complete name after that first
    letter, so it rejected one-letter tags and prefixes (a, a/b) and tags
    such as a1 whose second character cannot start a name. Everything that
    form accepted is still accepted here. *)

keyword = ":" name
        | ":" name "/" name.
    (* :/ is not a valid keyword, according to
    https://github.com/edn-format/edn/issues/32 *)

name = nameStart1 { nameConstituent }
     | nameStart2
       [ ( nameStart1 | nameStart2 | "#" | ":" ) { nameConstituent } ].
    (* A name that begins with ".", "-" or "+" must not have a digit as its
    second character, otherwise it would read as a number. README.md only
    requires that second character (if any) to be non-numeric, so any
    constituent other than a digit is accepted there, not just a letter.
    This admits names such as .. and -- in addition to -a. *)

nameStart1 = "!" | "*" | "?" | "_" | "$" | "%" | "&" | "=" | "<" | ">"
           | letter.
    (* "<" and ">" are included because README.md lists them among the
    characters a symbol may contain. *)

nameStart2 = "+" | "-" | ".".

nameConstituent = ":" | "#" | digit | nameStart1 | nameStart2.

letter = "A" | … | "Z" | "a" | … | "z".

digit = nonZeroDigit | "0".

nonZeroDigit = "1" | … | "9".

(* Although "nil", "true" and "false" have the shape of symbols, they are
never parsed as symbols. *)

nil = "nil".

boolean = "false" | "true".


(* numbers *)

number = integer | float.

sign = "+" | "-".

integer = [ sign ] cardinal [ "N" ].
    (* Superfluous leading zeros are rejected in integers here, even though
    README.md permits them. Both clojure.core/read and clojure.edn/read
    accept leading zeros but then interpret the remaining digits in
    base 8! See also issue 33.

    edn-java accepts leading zeros without attaching any special meaning
    to them. *)

float = [ sign ] cardinal frac [ exp ] [ "M" ]
      | [ sign ] cardinal exp [ "M" ]
      | [ sign ] cardinal "M".
    (* This deviates from the formal syntax given in the spec,
    https://github.com/edn-format/edn#floating-point-numbers, in order to
    comply with "In addition, a floating-point number may have the
    suffix M to indicate that exact precision is desired."

    README.md's grammar forbids leading zeros in the integer and exponent
    portions of a float, whereas clojure.core/read, clojure.edn/read and
    edn-java all accept them in these cases. See issue 33. *)

frac = "." { digit }.
    (* A lone "." with no digits after it is a valid fractional portion.
    That matches the current spec as well as the behavior of
    clojure.core/read and clojure.edn/read. *)

exp = ( "E" | "e" ) [ sign ] cardinal.

cardinal = "0" | nonZeroDigit { digit }.


(* characters *)

character = "\" characterName
          | "\" printableCharacter.

characterName = "newline" | "space" | "tab" | "return"
              | "backspace" | "formfeed".
    (* Only the first four are mentioned explicitly by the specification.
    Backspace and formfeed are added for symmetry with string, and
    because clojure.edn/read and clojure.core/read support them. *)

printableCharacter = "!" | … | "~" | U+A1 | … | MaxCodePoint.
    (* i.e. U+21 through U+7E plus everything from U+A1 upward. This leaves
    out the low and high control characters as well as space, DEL and U+A0.
    This is sloppy. There are probably other Unicode code points that
    should not appear in character literals because they have no printed
    representation or perform some control function. *)


(* strings *)

string = """" { stringChar | "\" stringEscape } """".

stringChar = U+1 | … | "!" | "#" | … | "[" | "]" | … | MaxCodePoint.
    (* every code point other than NUL, " and \ *)

stringEscape = "t" | "r" | "n" | "b" | "f" | """" | "\".
    (* edn-format/edn only mentions "Standard C/Java escape characters \t \r \n
    are supported", but \\ and \" clearly have to be included as well.
    \b and \f are included because Java supports them. \' is left out,
    even though Java supports it, because Clojure rejects \'. *)


(*
Wirth Syntax Notation
---------------------

http://en.wikipedia.org/wiki/Wirth_syntax_notation

SYNTAX     = { PRODUCTION } .
PRODUCTION = IDENTIFIER "=" EXPRESSION "." .
EXPRESSION = TERM { "|" TERM } .
TERM       = FACTOR { FACTOR } .
FACTOR     = IDENTIFIER
           | LITERAL
           | "[" EXPRESSION "]"
           | "(" EXPRESSION ")"
           | "{" EXPRESSION "}" .
IDENTIFIER = letter { letter } .
LITERAL    = """" character { character } """" .

An Extension to WSN to Represent Unicode Codepoints
---------------------------------------------------

Edn's syntax is specified in terms of unicode code points, the first
128 of which are identical to US-ASCII. (Edn is always serialized as
UTF-8, which can represent the full set of unicode code points.)

For the purposes of this grammar, we'll use the following extension to
represent a unicode code point:

HexDigit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9"
         | "A" | "B" | "C" | "D" | "E" | "F".

CodePoint = "U+" HexDigit { HexDigit }.

For example: NUL is written as U+0, " " can be written as U+20 and
"~" can be written as U+7E.

To represent large contiguous subsets of the Unicode code points, we
use an ellipsis as follows:

HexDigit = "0" | … | "9" | "A" | … | "F".
*)
	(* Syntax of Extensible Data Notation -- http://github.com/edn-format/edn

	See https://github.com/edn-format/edn/issues/56

	This grammar is written in a slightly extended version of Wirth Syntax
	Notation. A description is appended to the end of this document. *)



	(* start *)

	elements = { s element }.

	element = nil
	\| boolean
	\| symbol
	\| keyword
	\| number
	\| character
	\| string
	\| "{" s { element s element s } "}"
	\| "#{" elements s "}"
	\| "[" elements s "]"
	\| "(" elements s ")"
	\| "#" tagSymbol s element.
	(* Separating white space between elements is permitted, but it is not
	always needed. {}{} reads as two empty maps, and "MeaningOfLife"42 reads
	as a String followed by an integer. By contrast, a:b is one single
	symbol; it is not the symbol 'a' followed by the keyword ':b'. *)



	(* white space *)

	s = { whitespace \| comment \| discardedElement }.
	(* s marks each position where white space may occur. *)

	whitespace = " " \| "," \| HT \| LF \| CR.

	comment = ";" { commentContent } LF.
	(* Only \newline (i.e. LF) terminates a comment, as stated in
	https://github.com/edn-format/edn/issues/31. *)

	commentContent = U+1 \| … \| U+9 \| U+B \| … \| MaxCodePoint.
	(* every character other than NUL and LF. *)

	discardedElement = "#_" s element.
	(* Consequently, in "#_ #_ 2 1" the second #_ discards 2 and the
	first #_ discards 1, so as far as edn is concerned the whole example
	might as well be white space. *)

	HT = U+9.
	LF = U+A.
	CR = U+D.

	MaxCodePoint = U+10FFFF.



	(* symbols *)

	symbol = "/"
	\| name
	\| name "/" name.
	(* Either the lone "/" or a name, optionally preceded by a "name/"
	prefix. *)

	tagSymbol = letter { nameConstituent } [ "/" name ].
	(* https://github.com/edn-format/edn/issues/30#issuecomment-8540641
	"tag symbols must begin with an alphabetic character"

	A tag symbol is a symbol whose first character is a letter. The form
	letter [ name "/" ] name demanded a complete name after that first
	letter, so it rejected one-letter tags and prefixes (a, a/b) and tags
	such as a1 whose second character cannot start a name. Everything that
	form accepted is still accepted here. *)

	keyword = ":" name
	\| ":" name "/" name.
	(* :/ is not a valid keyword, according to
	https://github.com/edn-format/edn/issues/32 *)

	name = nameStart1 { nameConstituent }
	\| nameStart2 [ ( nameStart1 \| nameStart2 \| "#" \| ":" ) { nameConstituent } ].
	(* A name that begins with ".", "-" or "+" must not have a digit as its
	second character, otherwise it would read as a number. README.md only
	requires that second character (if any) to be non-numeric, so any
	constituent other than a digit is accepted there, not just a letter.
	This admits names such as .. and -- in addition to -a. *)

	nameStart1 = "!" \| "*" \| "?" \| "_" \| "$" \| "%" \| "&" \| "=" \| "<" \| ">"
	\| letter.
	(* "<" and ">" are included because README.md lists them among the
	characters a symbol may contain. *)

	nameStart2 = "+" \| "-" \| ".".

	nameConstituent = ":" \| "#" \| digit \| nameStart1 \| nameStart2.

	letter = "A" \| … \| "Z" \| "a" \| … \| "z".

	digit = nonZeroDigit \| "0".

	nonZeroDigit = "1" \| … \| "9".

	(* Although "nil", "true" and "false" have the shape of symbols, they are
	never parsed as symbols. *)

	nil = "nil".

	boolean = "false" \| "true".



	(* numbers *)

	number = integer \| float.

	sign = "+" \| "-".

	integer = [ sign ] cardinal [ "N" ].
	(* Superfluous leading zeros are rejected in integers here, even though
	README.md permits them. Both clojure.core/read and clojure.edn/read
	accept leading zeros but then interpret the remaining digits in
	base 8! See also issue 33.

	edn-java accepts leading zeros without attaching any special meaning
	to them. *)

	float = [ sign ] cardinal frac [ exp ] [ "M" ]
	\| [ sign ] cardinal exp [ "M" ]
	\| [ sign ] cardinal "M".
	(* This deviates from the formal syntax given in the spec,
	https://github.com/edn-format/edn#floating-point-numbers, in order to
	comply with "In addition, a floating-point number may have the
	suffix M to indicate that exact precision is desired."

	README.md's grammar forbids leading zeros in the integer and exponent
	portions of a float, whereas clojure.core/read, clojure.edn/read and
	edn-java all accept them in these cases. See issue 33. *)

	frac = "." { digit }.
	(* A lone "." with no digits after it is a valid fractional portion.
	That matches the current spec as well as the behavior of
	clojure.core/read and clojure.edn/read. *)

	exp = ( "E" \| "e" ) [ sign ] cardinal.

	cardinal = "0" \| nonZeroDigit { digit }.



	(* characters *)

	character = "\" characterName
	\| "\" printableCharacter.

	characterName = "newline" \| "space" \| "tab" \| "return"
	\| "backspace" \| "formfeed".
	(* Only the first four are mentioned explicitly by the specification.
	Backspace and formfeed are added for symmetry with string, and
	because clojure.edn/read and clojure.core/read support them. *)

	printableCharacter = "!" \| … \| "~" \| U+A1 \| … \| MaxCodePoint.
	(* i.e. U+21 through U+7E plus everything from U+A1 upward. This leaves
	out the low and high control characters as well as space, DEL and U+A0.
	This is sloppy. There are probably other Unicode code points that
	should not appear in character literals because they have no printed
	representation or perform some control function. *)



	(* strings *)

	string = """" { stringChar \| "\" stringEscape } """".

	stringChar = U+1 \| … \| "!" \| "#" \| … \| "[" \| "]" \| … \| MaxCodePoint.
	(* every code point other than NUL, " and \ *)

	stringEscape = "t" \| "r" \| "n" \| "b" \| "f" \| """" \| "\".
	(* edn-format/edn only mentions "Standard C/Java escape characters \t \r \n
	are supported", but \\ and \" clearly have to be included as well.
	\b and \f are included because Java supports them. \' is left out,
	even though Java supports it, because Clojure rejects \'. *)



	(*
	Wirth Syntax Notation
	---------------------

	http://en.wikipedia.org/wiki/Wirth_syntax_notation

	SYNTAX = { PRODUCTION } .
	PRODUCTION = IDENTIFIER "=" EXPRESSION "." .
	EXPRESSION = TERM { "\|" TERM } .
	TERM = FACTOR { FACTOR } .
	FACTOR = IDENTIFIER
	\| LITERAL
	\| "[" EXPRESSION "]"
	\| "(" EXPRESSION ")"
	\| "{" EXPRESSION "}" .
	IDENTIFIER = letter { letter } .
	LITERAL = """" character { character } """" .

	An Extension to WSN to Represent Unicode Codepoints
	---------------------------------------------------

	Edn's syntax is specified in terms of unicode code points, the first
	128 of which are identical to US-ASCII. (Edn is always serialized as
	UTF-8, which can represent the full set of unicode code points.)

	For the purposes of this grammar, we'll use the following extension to
	represent a unicode code point:

	HexDigit = "0" \| "1" \| "2" \| "3" \| "4" \| "5" \| "6" \| "7" \| "8" \| "9"
	\| "A" \| "B" \| "C" \| "D" \| "E" \| "F".

	CodePoint = "U+" HexDigit { HexDigit }.

	For example: NUL is written as U+0, " " can be written as U+20 and
	"~" can be written as U+7E.

	To represent large contiguous subsets of the Unicode code points, we
	use an ellipsis as follows:

	HexDigit = "0" \| … \| "9" \| "A" \| … \| "F".
	*)