# pkoppstein/xml.jq

## xml.jq
module {
  "name": "xml",
  "description": "PEG parser for XML",
  "version": "0.0.1",
  "homepage": "https://gist.github.com/pkoppstein/addaedbb10b6fc97ff2b6b00123700ad",
  "license": "MIT",
  "author": "pkoppstein at gmail dot com"
};

# This is a standalone jq module that has been tested with jq, gojq, and fq.
# See the end of this file for example invocations.

# The main goal of this XML parser is to translate valid XML documents
# into valid JSON losslessly, not to check for validity.  Thus the
# <?xml ... ?> header is optional, and "white space" is preserved when
# significant in accordance with the XML specification.  However, a
# filter, `jsonify`, is provided for removing strings matching the regex
# '^\n *$' (a newline followed only by spaces) in the "text" portions of the XML document. This filter also
# converts hex character codes of the form `&#x....;' to the
# corresponding character, e.g. "&#x00C9;mily" -> "Émily".

# Since "duplicate attribute names within a tag are not permitted with XML",
# we can group the attributes within a tag as a JSON object, as jq respects key ordering.

# Also, since XML tags cannot begin with `@`, PROLOG is rendered as a
# JSON object with key "@PROLOG" and likewise for COMMENT, DTD and CDATA.
# Consecutive attribute-value pairs are grouped together under "@attributes".

# The grammar is primarily adapted from:
# (1) https://peerj.com/preprints/1503/
# (2) https://cs.lmu.edu/~ray/notes/xmlgrammar/
# with the notable exception that (1) forgets to allow comments.

# Caveats
# 1) It has not been determined whether there are valid XML documents
#    which this parser would not recognize.
# 2) It has not been determined whether XML comments will always be recognized as such.

# Note that XML disallows comments:
#  . before the XML declaration and within comments
#  . within attribute values

# Note also that in the XML grammar, `Name` cannot begin with "@" or "." per (2),
# which defines Name as follows:
# NameChar  ::=  Letter | Digit |  '.' | '-' | '_' | ':' |  CombiningChar | Extender
# Name      ::=  (Letter | '_' | ':') (NameChar)*

#########################################################
# PEG-to-jq transcription is based on these equivalences:
# Sequence: e1 e2             e1 | e2
# Ordered choice: e1 / e2     e1 // e2
# Zero-or-more: e*            star(E)
# One-or-more: e+             plus(E)
# Optional: e?                optional(E)
# And-predicate: &e           amp(E)      # no input is consumed
# Not-predicate: !e           neg(E)      # no input is consumed

# The idea is to pass a JSON object {remainder:_, result:_ } through a
# pipeline, consuming the text in .remainder and building up .result.

# Zero-or-more (PEG `e*`): apply E again and again until it fails, then emit
# the last state reached; if E fails straight away the input is emitted as is.
# E must eventually fail, otherwise the recursion does not terminate.
def star(E):
  def again: (E | again) // .;
  again;
# One-or-more (PEG `e+`): E must succeed once; after that it is applied
# repeatedly until it fails and the last state reached is emitted.
def plus(E):
  def more: (E | more) // .;
  E | more;
# Optional (PEG `e?`): emit the result of E if it succeeds, otherwise pass the
# input through unchanged.
def optional(E):
  E // .;
# And-predicate (PEG `&e`): succeed, emitting the input unchanged, only if E
# succeeds on it; nothing is consumed.
# Only the first output of E is looked at, so the input is emitted at most
# once even if E can produce several outputs.
def amp(E): . as $in | first(E) | $in;
# Not-predicate (PEG `!e`): emit the input unchanged only if E produces no
# output at all for it; nothing is consumed.
def neg(E):
  select([E] | length == 0);

### Helper functions:

# Match the regular expression $re at the very start of .remainder.
# On success, drop the matched prefix from .remainder and store the matched
# text in .match; .result is NOT updated.  On failure, emit nothing.
# $re is wrapped in a non-capturing group so that the "^" anchor applies to
# the whole expression, including every branch of a top-level alternation.
def consume($re):
  (.remainder | match("^(?:" + $re + ")")) as $m
  | .remainder |= .[$m.length:]
  | .match = $m.string;

# Like consume($re), but additionally append the matched text to .result.
def parse($re):
  consume($re)
  | .match as $text
  | .result += [$text];

# Like consume($re), but append the matched text, converted with `tonumber`,
# to .result.
def parseNumber($re):
  consume($re)
  | (.match | tonumber) as $n
  | .result += [$n];

# Consume the literal string $s from the start of .remainder without recording
# it in .result; emit nothing if .remainder does not begin with $s.
def q($s):
  if (.remainder | startswith($s))
  then .remainder |= .[($s | length):]
  else empty
  end;

# Consume the literal string $s from the start of .remainder (as q does) and
# append it to .result.
def literal($s):
  q($s)
  | .result = .result + [$s];

# Succeed (passing the state through) only while there is input left.
def nonempty:
  select(.remainder | length > 0);

# End of string: succeed (passing the state through) only when no input is left.
def eos:
  select((.remainder | length) == 0);

# Required white space: consume one or more spaces, newlines, carriage
# returns or tabs; fails if there is none.
def _:
  consume("[ \n\r\t]+");

# Optional white space: consume any run (possibly empty) of spaces, newlines,
# carriage returns or tabs; always succeeds on a string .remainder.
def ws:
  consume("[ \n\r\t]*");

# Tagging
# Run E against a copy of the state whose .result has been reset to null, then
# append whatever E collected, as ONE item, to the caller's .result and adopt
# the .remainder that E left behind.  Emits nothing if E fails.
def box(E):
  (.result = null | E) as $inner
  | .remainder = $inner.remainder
  | .result = .result + [$inner.result];

# Like box(E), but the collected result is wrapped in an object keyed by
# `name`.  The value is the collected items joined into one string when
# `join("")` succeeds on them, and the collected result itself otherwise.
def box(name; E):
  (.result = null | E) as $inner
  | ($inner.result | try join("") catch $inner.result) as $value
  | .remainder = $inner.remainder
  | .result += [{(name): $value}];

# Consume the longest prefix of .remainder that does NOT contain a match of
# $regex and append it to .result as a single string ("" if the prefix is
# empty).  The text matching $regex itself is not consumed.
#
# "." does not match a newline in the default regex mode, so the
# one-character step is written "(?:.|\n)"; with a bare "." multi-line
# comments, CDATA sections and prologs would stop at the first line break.
# When nothing is consumed the boxed result is null, on which `add` would
# raise an error, hence the explicit null check.
def string_except($regex):
  box(star(neg(parse($regex)) | parse("(?:.|\n)")))
  | .result[-1] |= (if . == null then "" else add end);

# Box the items collected by E, then turn them into a one-key object: the
# first item becomes the key and the array of all remaining items the value.
def objectify(E):
  box(E)
  | .result[-1] |= (.[0] as $key | {($key): .[1:]});

# Box the items collected by E, then turn them into a one-key object: the
# first item becomes the key and the second item the value.
def keyvalue(E):
  box(E)
  | .result[-1] |= (.[0] as $key | .[1] as $value | {($key): $value});

#########################################################

# Parse the input string as an XML document and emit the resulting JSON array.
# Accepted layout: optional white space, an optional <?xml ...?> prolog, an
# optional <!...> declaration (e.g. DOCTYPE), any number of comments, exactly
# one root element, then any number of trailing comments.
# Emits nothing if the input cannot be parsed this way.
def XML:
  # A quoted attribute value, in double or single quotes (quotes are dropped).
  def String    : ((consume("\"") | parse("[^\"]*") | consume("\"")) //
                   (consume("'") | parse("[^']*") | consume("'")));

  def CDataSec  : box("@CDATA";  q("<![CDATA[") | string_except("]]>") | q("]]>") ) | ws;
  def PROLOG    : box("@PROLOG"; q("<?xml") | string_except("\\?>") | q("?>"));

  # Everything between "<!" and the closing ">".  A comment ("<!--") is never
  # taken for a declaration.  One bracketed internal subset "[ ... ]" is
  # accepted and may contain ">"; a "]" inside the subset is not supported.
  def DTD       : box("@DTD";
                      neg(q("<!--"))
                      | q("<!")
                      | parse("[^>\\[]*(?:\\[[^\\]]*\\][^>]*)?")
                      | q(">"));

  # The XML spec specifically disallows double-hyphen within comments
  def COMMENT   : box("@COMMENT"; q("<!--") | string_except("--") | q("-->"));

  def CharData  : parse("[^<]+");  # `<` and '&' are disallowed per W3C but entity references require '&'

  # This is more permissive than required:
  def Name      : parse("[A-Za-z:_][^/=<>\n\r\t ]*");

  def Attribute : keyvalue(Name | ws | q("=") | ws | String | ws);
  def Attributes: box( plus(Attribute) ) | .result[-1] |= {"@attributes": add} ;

  # <foo> must be matched with </foo>
  def Element   :
    def Content : star(Element // CDataSec // CharData // COMMENT);
    objectify( q("<")
         | Name
         | .result[-1] as $name     # remember the tag name for the closing tag
         | ws
         | (Attributes // ws)
         | (  (q("/>")
           // (q(">") | Content | q("</") | q($name) | ws | q(">")))
         | ws) ) ;

  {remainder: . }
  | ws
  | optional(PROLOG) | ws
  | optional(DTD) | ws
  | star(COMMENT | ws)
  | Element | ws             # for HTML, one would use star(Element) here
  | star(COMMENT | ws)
  | .result;

# Convert a string of hexadecimal digits (upper or lower case) to the integer
# it denotes.  The input is not validated.
def hex2i:
  # code point of a lower-cased hex digit -> its numeric value
  def digit: if . < 87 then . - 48 else . - 87 end;
  (ascii_downcase | explode | map(digit) | reverse) as $digits
  | reduce range(0; $digits | length) as $k ({weight: 1, total: 0};
      .total += $digits[$k] * .weight
      | .weight *= 16)
  | .total;

# Replace every hexadecimal character reference "&#xH...;" in the input string
# with the character it denotes, e.g. "&#x00C9;mily" -> "Émily".
# Any number of hex digits is accepted (so "&#xE9;" and "&#x1F600;" work as
# well); only genuine hex digits are matched.  A reference naming a surrogate
# or a value above U+10FFFF is not a valid code point and is left as it is.
def hexcode2json:
  gsub("&#x(?<x>[0-9A-Fa-f]+);";
       .x as $digits
       | ($digits | hex2i) as $cp
       | if $cp > 1114111 or ($cp >= 55296 and $cp <= 57343)
         then "&#x" + $digits + ";"
         else [$cp] | implode
         end);

# Tidy a parse result: throughout the structure, remove from every array the
# strings made of a newline followed only by spaces, and run hexcode2json on
# every string.
def jsonify:
  def blank: type == "string" and test("^\n *$");
  walk(
    if type == "string" then hexcode2json
    elif type == "array" then map(select(blank | not))
    else .
    end);

# Usage:
# The following examples assume that this module is named xml.jq
# jq -R 'include "xml" {search: "."}; XML'
# jq -R 'include "xml"; XML | jsonify[]'
	module {
	"name": "xml",
	"description": "PEG parser for XML",
	"version": "0.0.1",
	"homepage": "https://gist.github.com/pkoppstein/addaedbb10b6fc97ff2b6b00123700ad",
	"license": "MIT",
	"author": "pkoppstein at gmail dot com"
	};

	# This is a standalone jq module that has been tested with jq, gojq, and fq.
	# See the end of this file for example invocations.

	# The main goal of this XML parser is to translate valid XML documents
	# into valid JSON losslessly, not to check for validity. Thus the
	# <?xml ... ?> header is optional, and "white space" is preserved when
	# significant in accordance with the XML specification. However, a
	# filter, `jsonify`, is provided for removing strings matching the regex
	# '^\n *$' (a newline followed only by spaces) in the "text" portions of the XML document. This filter also
	# converts hex character codes of the form `&#x....;' to the
	# corresponding character, e.g. "&#x00C9;mily" -> "Émily".

	# Since "duplicate attribute names within a tag are not permitted with XML",
	# we can group the attributes within a tag as a JSON object, as jq respects key ordering.

	# Also, since XML tags cannot begin with `@`, PROLOG is rendered as a
	# JSON object with key "@PROLOG" and likewise for COMMENT, DTD and CDATA.
	# Consecutive attribute-value pairs are grouped together under "@attributes".

	# The grammar is primarily adapted from:
	# (1) https://peerj.com/preprints/1503/
	# (2) https://cs.lmu.edu/~ray/notes/xmlgrammar/
	# with the notable exception that (1) forgets to allow comments.

	# Caveats
	# 1) It has not been determined whether there are valid XML documents
	# which this parser would not recognize.
	# 2) It has not been determined whether XML comments will always be recognized as such.

	# Note that XML disallows comments:
	# . before the XML declaration and within comments
	# . within attribute values

	# Note also that in the XML grammar, `Name` cannot begin with "@" or "." per (2),
	# which defines Name as follows:
	# NameChar  ::=  Letter | Digit |  '.' | '-' | '_' | ':' |  CombiningChar | Extender
	# Name      ::=  (Letter | '_' | ':') (NameChar)*

	#########################################################
	# PEG-to-jq transcription is based on these equivalences:
	# Sequence: e1 e2             e1 | e2
	# Ordered choice: e1 / e2     e1 // e2
	# Zero-or-more: e*            star(E)
	# One-or-more: e+             plus(E)
	# Optional: e?                optional(E)
	# And-predicate: &e           amp(E)      # no input is consumed
	# Not-predicate: !e           neg(E)      # no input is consumed

	# The idea is to pass a JSON object {remainder:_, result:_ } through a
	# pipeline, consuming the text in .remainder and building up .result.

	# Zero-or-more (PEG `e*`): apply E again and again until it fails, then emit
	# the last state reached; if E fails straight away the input is emitted as is.
	# E must eventually fail, otherwise the recursion does not terminate.
	def star(E):
	  def again: (E | again) // .;
	  again;
	# One-or-more (PEG `e+`): E must succeed once; after that it is applied
	# repeatedly until it fails and the last state reached is emitted.
	def plus(E):
	  def more: (E | more) // .;
	  E | more;
	# Optional (PEG `e?`): emit the result of E if it succeeds, otherwise pass the
	# input through unchanged.
	def optional(E):
	  E // .;
	# And-predicate (PEG `&e`): succeed, emitting the input unchanged, only if E
	# succeeds on it; nothing is consumed.
	# Only the first output of E is looked at, so the input is emitted at most
	# once even if E can produce several outputs.
	def amp(E): . as $in | first(E) | $in;
	# Not-predicate (PEG `!e`): emit the input unchanged only if E produces no
	# output at all for it; nothing is consumed.
	def neg(E):
	  select([E] | length == 0);

	### Helper functions:

	# Match the regular expression $re at the very start of .remainder.
	# On success, drop the matched prefix from .remainder and store the matched
	# text in .match; .result is NOT updated.  On failure, emit nothing.
	# $re is wrapped in a non-capturing group so that the "^" anchor applies to
	# the whole expression, including every branch of a top-level alternation.
	def consume($re):
	  (.remainder | match("^(?:" + $re + ")")) as $m
	  | .remainder |= .[$m.length:]
	  | .match = $m.string;

	# Like consume($re), but additionally append the matched text to .result.
	def parse($re):
	  consume($re)
	  | .match as $text
	  | .result += [$text];

	# Like consume($re), but append the matched text, converted with `tonumber`,
	# to .result.
	def parseNumber($re):
	  consume($re)
	  | (.match | tonumber) as $n
	  | .result += [$n];

	# Consume the literal string $s from the start of .remainder without recording
	# it in .result; emit nothing if .remainder does not begin with $s.
	def q($s):
	  if (.remainder | startswith($s))
	  then .remainder |= .[($s | length):]
	  else empty
	  end;

	# Consume the literal string $s from the start of .remainder (as q does) and
	# append it to .result.
	def literal($s):
	  q($s)
	  | .result = .result + [$s];

	# Succeed (passing the state through) only while there is input left.
	def nonempty:
	  select(.remainder | length > 0);

	# End of string: succeed (passing the state through) only when no input is left.
	def eos:
	  select((.remainder | length) == 0);

	# Required white space: consume one or more spaces, newlines, carriage
	# returns or tabs; fails if there is none.
	def _:
	  consume("[ \n\r\t]+");

	# Optional white space: consume any run (possibly empty) of spaces, newlines,
	# carriage returns or tabs; always succeeds on a string .remainder.
	def ws:
	  consume("[ \n\r\t]*");

	# Tagging
	# Run E against a copy of the state whose .result has been reset to null, then
	# append whatever E collected, as ONE item, to the caller's .result and adopt
	# the .remainder that E left behind.  Emits nothing if E fails.
	def box(E):
	  (.result = null | E) as $inner
	  | .remainder = $inner.remainder
	  | .result = .result + [$inner.result];

	# Like box(E), but the collected result is wrapped in an object keyed by
	# `name`.  The value is the collected items joined into one string when
	# `join("")` succeeds on them, and the collected result itself otherwise.
	def box(name; E):
	  (.result = null | E) as $inner
	  | ($inner.result | try join("") catch $inner.result) as $value
	  | .remainder = $inner.remainder
	  | .result += [{(name): $value}];

	# Consume the longest prefix of .remainder that does NOT contain a match of
	# $regex and append it to .result as a single string ("" if the prefix is
	# empty).  The text matching $regex itself is not consumed.
	#
	# "." does not match a newline in the default regex mode, so the
	# one-character step is written "(?:.|\n)"; with a bare "." multi-line
	# comments, CDATA sections and prologs would stop at the first line break.
	# When nothing is consumed the boxed result is null, on which `add` would
	# raise an error, hence the explicit null check.
	def string_except($regex):
	  box(star(neg(parse($regex)) | parse("(?:.|\n)")))
	  | .result[-1] |= (if . == null then "" else add end);

	# Box the items collected by E, then turn them into a one-key object: the
	# first item becomes the key and the array of all remaining items the value.
	def objectify(E):
	  box(E)
	  | .result[-1] |= (.[0] as $key | {($key): .[1:]});

	# Box the items collected by E, then turn them into a one-key object: the
	# first item becomes the key and the second item the value.
	def keyvalue(E):
	  box(E)
	  | .result[-1] |= (.[0] as $key | .[1] as $value | {($key): $value});

	#########################################################

	# Parse the input string as an XML document and emit the resulting JSON array.
	# Accepted layout: optional white space, an optional <?xml ...?> prolog, an
	# optional <!...> declaration (e.g. DOCTYPE), any number of comments, exactly
	# one root element, then any number of trailing comments.
	# Emits nothing if the input cannot be parsed this way.
	def XML:
	  # A quoted attribute value, in double or single quotes (quotes are dropped).
	  def String    : ((consume("\"") | parse("[^\"]*") | consume("\"")) //
	                   (consume("'") | parse("[^']*") | consume("'")));

	  def CDataSec  : box("@CDATA";  q("<![CDATA[") | string_except("]]>") | q("]]>") ) | ws;
	  def PROLOG    : box("@PROLOG"; q("<?xml") | string_except("\\?>") | q("?>"));

	  # Everything between "<!" and the closing ">".  A comment ("<!--") is never
	  # taken for a declaration.  One bracketed internal subset "[ ... ]" is
	  # accepted and may contain ">"; a "]" inside the subset is not supported.
	  def DTD       : box("@DTD";
	                      neg(q("<!--"))
	                      | q("<!")
	                      | parse("[^>\\[]*(?:\\[[^\\]]*\\][^>]*)?")
	                      | q(">"));

	  # The XML spec specifically disallows double-hyphen within comments
	  def COMMENT   : box("@COMMENT"; q("<!--") | string_except("--") | q("-->"));

	  def CharData  : parse("[^<]+");  # `<` and '&' are disallowed per W3C but entity references require '&'

	  # This is more permissive than required:
	  def Name      : parse("[A-Za-z:_][^/=<>\n\r\t ]*");

	  def Attribute : keyvalue(Name | ws | q("=") | ws | String | ws);
	  def Attributes: box( plus(Attribute) ) | .result[-1] |= {"@attributes": add} ;

	  # <foo> must be matched with </foo>
	  def Element   :
	    def Content : star(Element // CDataSec // CharData // COMMENT);
	    objectify( q("<")
	         | Name
	         | .result[-1] as $name     # remember the tag name for the closing tag
	         | ws
	         | (Attributes // ws)
	         | (  (q("/>")
	           // (q(">") | Content | q("</") | q($name) | ws | q(">")))
	         | ws) ) ;

	  {remainder: . }
	  | ws
	  | optional(PROLOG) | ws
	  | optional(DTD) | ws
	  | star(COMMENT | ws)
	  | Element | ws             # for HTML, one would use star(Element) here
	  | star(COMMENT | ws)
	  | .result;

	# Convert a string of hexadecimal digits (upper or lower case) to the integer
	# it denotes.  The input is not validated.
	def hex2i:
	  # code point of a lower-cased hex digit -> its numeric value
	  def digit: if . < 87 then . - 48 else . - 87 end;
	  (ascii_downcase | explode | map(digit) | reverse) as $digits
	  | reduce range(0; $digits | length) as $k ({weight: 1, total: 0};
	      .total += $digits[$k] * .weight
	      | .weight *= 16)
	  | .total;

	# Replace every hexadecimal character reference "&#xH...;" in the input string
	# with the character it denotes, e.g. "&#x00C9;mily" -> "Émily".
	# Any number of hex digits is accepted (so "&#xE9;" and "&#x1F600;" work as
	# well); only genuine hex digits are matched.  A reference naming a surrogate
	# or a value above U+10FFFF is not a valid code point and is left as it is.
	def hexcode2json:
	  gsub("&#x(?<x>[0-9A-Fa-f]+);";
	       .x as $digits
	       | ($digits | hex2i) as $cp
	       | if $cp > 1114111 or ($cp >= 55296 and $cp <= 57343)
	         then "&#x" + $digits + ";"
	         else [$cp] | implode
	         end);

	# Tidy a parse result: throughout the structure, remove from every array the
	# strings made of a newline followed only by spaces, and run hexcode2json on
	# every string.
	def jsonify:
	  def blank: type == "string" and test("^\n *$");
	  walk(
	    if type == "string" then hexcode2json
	    elif type == "array" then map(select(blank | not))
	    else .
	    end);

	# Usage:
	# The following examples assume that this module is named xml.jq
	# jq -R 'include "xml" {search: "."}; XML'
	# jq -R 'include "xml"; XML | jsonify[]'