Skip to content

Instantly share code, notes, and snippets.

@pkoppstein
Last active January 19, 2023 00:48
Show Gist options
  • Save pkoppstein/addaedbb10b6fc97ff2b6b00123700ad to your computer and use it in GitHub Desktop.
Save pkoppstein/addaedbb10b6fc97ff2b6b00123700ad to your computer and use it in GitHub Desktop.
module {
"name": "xml",
"description": "PEG parser for XML",
"version": "0.0.1",
"homepage": "https://gist.github.com/pkoppstein/addaedbb10b6fc97ff2b6b00123700ad",
"license": "MIT",
"author": "pkoppstein at gmail dot com"
};
# This is a standalone jq module that has been tested with jq, gojq, and fq.
# See the end of this file for example invocations.
# The main goal of this XML parser is to translate valid XML documents
# into valid JSON losslessly, not to check for validity. Thus the
# <?xml ... ?> header is optional, and "white space" is preserved when
# significant in accordance with the XML specification. However, a
# filter, `jsonify`, is provided for removing strings of the form
# '\n *$' in the "text" portions of the XML document. This filter also
# converts hex character codes of the form `&#x....;` to the
# corresponding character, e.g. "&#x00C9;mily" -> "Émily".
# Since "duplicate attribute names within a tag are not permitted with XML",
# we can group the attributes within a tag as a JSON object, as jq respects key ordering.
# Also, since XML tags cannot begin with `@`, PROLOG is rendered as a
# JSON object with key "@PROLOG" and likewise for COMMENT, DTD and CDATA.
# Consecutive attribute-value pairs are grouped together under "@attributes".
# The grammar is primarily adapted from:
# (1) https://peerj.com/preprints/1503/
# (2) https://cs.lmu.edu/~ray/notes/xmlgrammar/
# with the notable exception that (1) forgets to allow comments.
# Caveats
# 1) It has not been determined whether there are valid XML documents
# which this parser would not recognize.
# 2) It has not been determined whether XML comments will always be recognized as such.
# Note that XML disallows comments:
# . before the XML declaration and within comments
# . within attribute values
# Note also that in the XML grammar, `Name` cannot begin with "@" or "." per (2),
# which defines Name as follows:
# NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | CombiningChar | Extender
# Name ::= (Letter | '_' | ':') (NameChar)*
#########################################################
# PEG-to-jq transcription is based on these equivalences:
#   Construct         PEG        jq
#   Sequence:         e1 e2      e1 | e2
#   Ordered choice:   e1 / e2    e1 // e2
#   Zero-or-more:     e*         star(E)
#   One-or-more:      e+         plus(E)
#   Optional:         e?         optional(E)
#   And-predicate:    &e         amp(E)   # no input is consumed
#   Not-predicate:    !e         neg(E)   # no input is consumed
# The idea is to pass a JSON object {remainder:_, result:_ } through a
# pipeline, consuming the text in .remainder and building up .result.
# Zero-or-more (PEG `e*`): keep applying E for as long as it succeeds and
# emit the state reached; if E fails straight away, emit the input unchanged.
# E must consume input on success, otherwise the recursion never terminates.
def star(E):
  (E | star(E))
  // . ;
# One-or-more (PEG `e+`): E must succeed once; after that, keep applying it
# for as long as it succeeds. Emits nothing if the first application fails.
def plus(E):
  E
  | (plus(E) // .);
# Optional (PEG `e?`): the result of E if it succeeds, otherwise the
# input state unchanged.
def optional(E):
  E // . ;
# And-predicate (PEG `&e`): succeed, consuming nothing, iff E succeeds.
# Only the first output of E is examined, so the input state is emitted
# exactly once even when E produces several outputs; without this a
# predicate would multiply the number of parse states flowing downstream.
def amp(E): . as $in | first(E) | $in;
# Not-predicate (PEG `!e`): succeed, consuming nothing, iff E produces
# no output at all for the current state.
def neg(E):
  if ([E] | length) == 0 then . else empty end;
### Helper functions:
# Consume a regular expression rooted at the start of .remainder, or emit empty;
# on success, update .remainder and set .match but do NOT update .result.
# $re is wrapped in a non-capturing group so that the `^` anchor applies to
# the whole expression: with a bare "^" + $re, a pattern such as "a|b" would
# anchor only "a" and could match "b" further along the string, after which
# the wrong prefix would be sliced off .remainder.
def consume($re):
  # on failure, match yields empty
  (.remainder | match("^(?:" + $re + ")")) as $m
  | .remainder |= .[$m.length :]
  | .match = $m.string;
# Consume $re at the start of .remainder and append the matched text
# to .result; emits nothing if $re does not match there.
def parse($re):
  consume($re)
  | .result += [.match] ;
# Like parse/1, but the matched text is converted with `tonumber`
# before being appended to .result.
def parseNumber($re):
  consume($re)
  | (.match | tonumber) as $n
  | .result += [$n] ;
# Consume the literal string $s from the start of .remainder without
# recording it in .result; emits nothing if .remainder does not start with $s.
def q($s):
  ($s | length) as $n
  | if .remainder | startswith($s)
    then .remainder |= .[$n :]
    else empty
    end ;
# Consume the literal string $s (as q/1 does) and also append it to .result.
def literal($s):
  q($s)
  | .result = .result + [$s];
# Pass the state through only while there is still unconsumed input.
def nonempty:
  if (.remainder | length) > 0 then . else empty end;
# End of input: pass the state through only when .remainder has length 0.
def eos:
  if (.remainder | length) == 0 then . else empty end;
# Required white space: one or more of space, newline, carriage return
# or tab at the start of .remainder; fails if there is none.
def _:
  consume("[ \n\r\t]+");
# Optional white space: skips any run (possibly empty) of space, newline,
# carriage return or tab at the start of .remainder.
def ws:
  consume("[ \n\r\t]*");
# Tagging
# Run E against a copy of the state whose .result has been reset to null,
# then append everything E collected to the outer .result as ONE element,
# and adopt the .remainder that E left behind.
def box(E):
  (.result = null | E) as $inner
  | .remainder = $inner.remainder
  | .result += [$inner.result]   # the nested result is what gives the output its structure
  ;
# Tagged variant of box/1: whatever E collects is appended to the outer
# .result as a single-key object {(name): value}. The value is the collected
# pieces joined into one string when `join` accepts them, and the raw
# collected result otherwise.
def box(name; E):
  (.result = null | E) as $inner
  | .remainder = $inner.remainder
  | .result += [ {(name): ($inner.result | try join("") catch $inner.result)} ]
  ;
# A (possibly empty) string that does NOT contain $regex: characters are
# taken one at a time for as long as $regex does not match at the current
# position, and are then concatenated into one element of .result
# (null if no character was taken).
# `[\s\S]` is used instead of `.` because `.` does not match a newline by
# default, which made multi-line comments, CDATA sections and prologs
# unparseable.
def string_except($regex):
  box(star(neg( parse($regex) ) | parse("[\\s\\S]"))) | .result[-1] |= add;
# Box E, then turn the boxed array [key, item1, item2, ...] into the
# single-key object {key: [item1, item2, ...]}.
def objectify(E):
  box(E)
  | .result[-1] |= (.[0] as $key | {($key): .[1:]}) ;
# Box E, then turn the boxed array [key, value, ...] into the single-key
# object {key: value}; anything after the second element is dropped.
def keyvalue(E):
  box(E)
  | .result[-1] |= (.[0] as $key | {($key): .[1]}) ;
#########################################################
# Parse the input string as an XML document and emit the accumulated
# .result array: optional "@PROLOG", optional "@DTD", any "@COMMENT"s, and
# the root element as {name: [children...]}. Emits nothing if the input
# cannot be parsed. Input left over after the trailing comments is ignored.
def XML:
  # Attribute value in double or single quotes; only the text between the
  # quotes is recorded.
  def String : ((consume("\"") | parse("[^\"]*") | consume("\"")) //
                (consume("'") | parse("[^']*") | consume("'")));
  def CDataSec : box("@CDATA"; q("<![CDATA[") | string_except("]]>") | q("]]>") ) | ws;
  def PROLOG : box("@PROLOG"; q("<?xml") | string_except("\\?>") | q("?>"));
  # `[^>]*` takes the whole declaration up to the closing `>`; the previous
  # `[^>]` took a single character, so any <!DOCTYPE ...> made the parse fail.
  # neg(q("--")) stops a comment (`<!-- ... -->`) from being read as a DTD.
  # A DTD with an internal subset containing `>` is still not supported.
  def DTD : box("@DTD"; q("<!") | neg(q("--")) | parse("[^>]*") | q(">"));
  # The XML spec specifically disallows double-hyphen within comments
  def COMMENT : box("@COMMENT"; q("<!--") | string_except("--") | q("-->"));
  def CharData : parse("[^<]+"); # `<` and '&' are disallowed per W3C but entity references require '&'
  # This is more permissive than required:
  def Name : parse("[A-Za-z:_][^/=<>\n\r\t ]*");
  def Attribute : keyvalue(Name | ws | q("=") | ws | String | ws);
  # All attributes of one tag are merged into a single {"@attributes": {...}} object.
  def Attributes: box( plus(Attribute) ) | .result[-1] |= {"@attributes": add} ;
  # <foo> must be matched with </foo>
  def Element :
    def Content : star(Element // CDataSec // CharData // COMMENT);
    objectify( q("<")
               | Name
               | .result[-1] as $name   # remembered so the closing tag can be checked
               | ws
               | (Attributes // ws)
               | ( (q("/>")
                    // (q(">") | Content | q("</") | q($name) | ws | q(">")))
                   | ws) ) ;
  {remainder: . }
  | ws
  | optional(PROLOG) | ws
  | star(COMMENT | ws)    # comments are allowed between the XML declaration and the DTD
  | optional(DTD) | ws
  | star(COMMENT | ws)
  | Element | ws          # for HTML, one would use star(Element) here
  | star(COMMENT | ws)
  | .result;
# Convert a string of hexadecimal digits (either case) to a number.
# The empty string yields 0. Characters are not validated.
def hex2i:
  # Code point of a lower-cased hex digit -> its value:
  # "a" is 97, so 97 - 87 = 10; "0" is 48.
  def digit: if . < 87 then . - 48 else . - 87 end;
  reduce (ascii_downcase | explode | reverse | .[] | digit) as $d
    ({weight: 1, total: 0};       # least-significant digit first
     .total += $d * .weight
     | .weight *= 16)
  | .total;
# Replace hexadecimal character references such as "&#x00C9;" or "&#xC9;"
# with the character they denote.
# Only 1-6 genuine hex digits are accepted: the previous pattern `....`
# matched any four characters, so it converted non-hex text into garbage
# and ignored references that were not exactly four digits long.
# References to surrogates or to values above U+10FFFF are left as they
# are rather than being passed to `implode`.
def hexcode2json:
  gsub("&#x(?<x>[0-9A-Fa-f]{1,6});";
       (.x | hex2i) as $cp
       | if $cp > 1114111 or ($cp >= 55296 and $cp <= 57343)
         then "&#x\(.x);"
         else [$cp] | implode
         end) ;
# Tidy up the output of the parser: in every array, drop string items that
# consist of a newline followed only by spaces, and in every remaining string
# replace hex character references via hexcode2json.
def jsonify:
  def is_blank_text: type == "string" and test("^\n *$");
  walk(
    if type == "string" then hexcode2json
    elif type == "array" then map(select(is_blank_text | not))
    else .
    end);
# Usage:
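# The following examples assume that this module is named xml.jq.
# Note: -R reads the input one line at a time; use -Rs to parse a document
# that spans several lines as a single string.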
# jq -R 'include "xml" {search: "."}; XML'
# jq -R 'include "xml"; XML | jsonify[]'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment