Skip to content

Instantly share code, notes, and snippets.

@pchiusano
Created February 11, 2011 15:26
Show Gist options
  • Save pchiusano/822494 to your computer and use it in GitHub Desktop.
Save pchiusano/822494 to your computer and use it in GitHub Desktop.
Adapter to use scala's parser combinators for XML parsing
package xmlcombinators
import scala.util.parsing.combinator.Parsers
import scala.util.parsing.input.{NoPosition, Reader}
import javax.xml.stream.events.{Attribute, EndElement, XMLEvent}
import javax.xml.stream.{XMLEventReader, XMLInputFactory}
import collection.mutable.ArrayBuffer
import java.io.File
/**
* License: MIT license (http://www.opensource.org/licenses/mit-license.php)
* Copyright (C) 2011 by Capital IQ
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
* @author Anthony Enache, Paul Chiusano
*/
/** Adapter to run parser combinators using XMLEvent as the token type.
* Currently works by grabbing the full list of events up front from a
* javax.xml.stream.XMLEventReader, so this is not suitable for streaming.
*
* Examples:
* startElement("foo") <~ endElement("foo") applied to <foo bar="qux" baz="juju"></foo>
* results in the Map[String,String] of attrs for the tag 'foo': Map("bar"->"qux", "baz"->"juju")
*
* startElement("foo") ~ rep(bar) <~ endElement("foo") map { case attrs ~ bars => ... }
*
* Generally, it's better to use the nest and element combinators, to avoid having to
* manually match start and end tags in the grammar.
*/
trait XMLEventParser extends Parsers {
  /** The tokens consumed by these parsers are StAX events. */
  type Elem = XMLEvent

  /** Implicitly copies a Scala sequence into a new `java.util.ArrayList`, preserving order. */
  implicit def listToJavaList[T](l: Seq[T]): java.util.ArrayList[T] = {
    val javaList = new java.util.ArrayList[T](l.size)
    l.foreach(e => javaList.add(e))
    javaList
  }

  /** Builds a parser that inspects exactly one event.
    *
    * If the current event is non-null and `extract` is defined for it, the parser succeeds
    * with the extracted value and consumes the event. Otherwise it fails *at the current
    * input* (nothing is consumed), with the message
    * "Expected <expected>, but found <event>".
    *
    * The end-of-input condition is intentionally detected through `in.first` rather than
    * `in.atEnd`, so the behaviour does not depend on how a particular Reader defines `atEnd`.
    */
  private def eventParser[T](expected: String)(extract: PartialFunction[XMLEvent, T]): Parser[T] =
    new Parser[T] {
      def apply(in: Input): ParseResult[T] = {
        val elt = in.first
        if ((elt ne null) && extract.isDefinedAt(elt)) Success(extract(elt), in.rest)
        else Failure("Expected " + expected + ", but found " + elt, in)
      }
    }

  /** Parser for a start tag with local name `s`.
    * We return a map of the attributes defined by the start element tag, keyed by the
    * attribute's local name (if two attributes share a local name, the later one wins).
    */
  def startElement(s: String): Parser[Map[String, String]] =
    eventParser[Map[String, String]]("start element with label " + s) {
      case e if e.isStartElement && e.asStartElement.getName.getLocalPart == s =>
        val as = e.asStartElement.getAttributes
        val attributes = Map.newBuilder[String, String]
        while (as.hasNext) {
          // The cast keeps this compiling against StAX versions whose iterator is untyped.
          val attr = as.next().asInstanceOf[Attribute]
          attributes += (attr.getName.getLocalPart -> attr.getValue)
        }
        attributes.result()
    }

  /** Parser for an end tag with local name `s`; yields the end element event itself. */
  def endElement(s: String): Parser[EndElement] =
    eventParser[EndElement]("end element with label " + s) {
      case e if e.isEndElement && e.asEndElement.getName.getLocalPart == s => e.asEndElement
    }

  /** Parser for leaf elements of type <tag attr1="" attr2="" .../>. From the reader, this would generate
    * a start element and end element event, of which, only the start element is truly interesting as it
    * carries the attributes with it.
    */
  def element(s: String): Parser[Map[String, String]] = startElement(s) <~ endElement(s)

  /** Parser for non-leaf elements: `inner` is applied between the start and end tags of `s`,
    * and `f` combines the start tag's attributes with the result of `inner`.
    */
  def element[A, B](s: String, inner: Parser[A])(f: ((Map[String, String], A)) => B): Parser[B] =
    ((startElement(s) ~ inner) <~ endElement(s)) ^^ { case attrs ~ a => f((attrs, a)) }

  /** Parser for leaf elements with attributes. The function f extracts an A from these attrs. */
  def element[A](s: String, f: Map[String, String] => A): Parser[A] =
    startElement(s).map(f) <~ endElement(s)

  /** Parser for elements surrounded by the tag withinTag, whose attributes are ignored. */
  def nest[A](withinTag: String, inner: Parser[A]): Parser[A] =
    element[A, A](withinTag, inner) { case (_, a) => a }

  /** Typesafe choice between two parsers: `Left` if `p` succeeds, otherwise `Right` if `p2` does. */
  def choice[A, B](p: Parser[A], p2: Parser[B]): Parser[Either[A, B]] =
    p.map(a => Left(a)) | p2.map(b => Right(b))

  /** Parser for text within a tag. Example:
    * nest("foo", textElement) applied to '<foo>thisText</foo>' results in 'thisText'.
    * Only a single character-data event is consumed.
    */
  def textElement: Parser[String] =
    eventParser[String]("text element") {
      case e if e.isCharacters => e.asCharacters.getData
    }

  /** Parser that extracts the text from an element <s>text</s> */
  def textOnlyElement(s: String): Parser[String] = nest(s, textElement)

  /** Parser for an element <s>...</s> whose text content is optional: yields `Some(text)`
    * when a text event is present between the tags and `None` otherwise.
    */
  def optionalTextOnlyElement(s: String): Parser[Option[String]] = nest(s, opt(textElement))

  /** Parse some prefix of reader `in' with parser `p' */
  def parse[T](p: Parser[T], in: Reader[XMLEvent]): ParseResult[T] =
    p(in)

  /** Parse all of reader `in' with parser `p' */
  def parseAll[T](p: Parser[T], in: Reader[XMLEvent]): ParseResult[T] =
    parse(phrase(p), in)

  /** Extract the mandatory attribute from the attribute map.
    * Throws IllegalArgumentException (via `require`) if `a` is not present in `as`.
    */
  def attribute(as: Map[String, String], a: String): String = {
    require(as.contains(a), "Missing mandatory attribute: " + a)
    as(a)
  }

  /** Extract the value of the attribute, if it exists, otherwise use the default */
  def attribute(as: Map[String, String], a: String, default: String): String =
    as.getOrElse(a, default)
}
object EventReader {
  /** Eagerly reads every XML event available from `r` into an array.
    *
    * Document start/end events and whitespace-only character events are dropped; all other
    * events are kept in document order.
    *
    * The StAX event reader created here is closed before returning, even if reading fails.
    * Closing it does not close `r` itself, which remains the caller's responsibility.
    *
    * NOTE(review): the input factory is used with its default configuration, so a single run
    * of text may be delivered as several adjacent character events — confirm whether callers
    * rely on one event per text node.
    *
    * @param r the character source containing the XML document
    * @return the retained events, in the order they were read
    */
  def readEvents(r: java.io.Reader): Array[XMLEvent] = {
    val factory = XMLInputFactory.newInstance()
    val reader: XMLEventReader = factory.createXMLEventReader(r)
    val result = new ArrayBuffer[XMLEvent]()
    try {
      while (reader.hasNext) {
        val event = reader.nextEvent()
        val isDocumentBoundary = event.isStartDocument || event.isEndDocument
        val isBlankText = event.isCharacters && event.asCharacters.isWhiteSpace
        if (!isDocumentBoundary && !isBlankText) {
          result += event
        }
      }
    } finally {
      // Release the parser's resources; the already-collected events stay usable.
      reader.close()
    }
    result.toArray
  }
}
/**
 * A parser-combinator Reader over a fully materialised array of XML events.
 *
 * Note that the events produced by `EventReader.readEvents` (used by the auxiliary
 * constructor) exclude whitespace-only character events as well as the document start
 * and end events.
 *
 * @param index  position of the current event within `events`
 * @param events the complete event sequence being read
 */
class EventReader(index: Int, events: Array[XMLEvent]) extends Reader[XMLEvent] {
  /** Reads all events from `r` up front and positions the reader on the first one. */
  def this(r: java.io.Reader) = this(0, EventReader.readEvents(r))

  /** True once every event has been consumed; an empty event array is at end immediately. */
  def atEnd: Boolean = index >= events.length

  /** The current event.
    *
    * Once the input is exhausted there is no real event left, so a synthetic end-of-document
    * event is returned instead. It is neither a start element, an end element nor character
    * data, so token parsers that test the event kind fail cleanly rather than re-reading the
    * last real event or hitting an out-of-bounds access.
    */
  def first: XMLEvent =
    if (atEnd) javax.xml.stream.XMLEventFactory.newInstance().createEndDocument()
    else events(index)

  /** The reader positioned after the current event; at end of input this reader itself. */
  def rest: EventReader = if (atEnd) this else new EventReader(index + 1, events)

  /** No source positions are tracked for XML events. */
  def pos: NoPosition.type = NoPosition
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment