// Lazy parser for sitemap XML.
import scala.xml.pull._
import scalaz._
import Scalaz._
/**
 * Type class for input containers that can be consumed one element at a time.
 *
 * Parsers over such a container are `StateT[Option, F[A], B]` values: the
 * state is the remaining input and `None` represents a failed parse.
 *
 * @tparam F the input container type constructor
 */
trait Input[F[_]] {

  /** A parser that consumes elements of type `A` from an `F[A]` and yields a `B`. */
  type =>>[A, B] = StateT[Option, F[A], B]

  /**
   * Splits the first element off the input.
   *
   * @param f the input to read from
   * @return the remaining input paired with the element read; the instances in
   *         this file return `None` when no element is available
   */
  def uncons[A](f: F[A]): Option[(F[A], A)]

  /** Parser that reads one element via [[uncons]] and fails when `uncons` yields `None`. */
  def next[A]: (A =>> A) = StateT { uncons }

  /** The failed parse step: an empty `Option` of the (input, result) pair. */
  def none[A, B]: Option[(F[A], B)] = Option.empty[(F[A], B)]

  /** Parser that always fails, whatever the input. */
  def empty[A, B]: (A =>> B) = StateT { _ => none }

  /** Parser that yields `b` and leaves the input untouched. */
  def ret[A, B](b: B): (A =>> B) =
    StateT { f => Option(f -> b) }
}
/** Summoner and `Input` instances for `Iterator` and `Stream`. */
object Input {

  /** Returns the implicit `Input` instance for `F`. */
  def apply[F[_]: Input]: Input[F] = implicitly[Input[F]]

  /**
   * `Input` over a mutable `Iterator`: reading an element advances the
   * iterator itself, and that same iterator is returned as the remaining input.
   */
  implicit val iterator: Input[Iterator] = new Input[Iterator] {
    override def uncons[A](
      i: Iterator[A]): Option[(Iterator[A], A)] =
      // NOTE(review): this line was garbled in the source
      // (`Option( map (i -> _)`); reading the next element is the
      // reconstruction consistent with the declared result type - confirm.
      if (i.hasNext) Option(i.next()).map(i -> _)
      else none
  }

  /** `Input` over an immutable `Stream`: the remaining input is the stream's tail. */
  implicit val stream: Input[Stream] = new Input[Stream] {
    override def uncons[A](
      s: Stream[A]): Option[(Stream[A], A)] =
      // `s.tail` sits inside the mapping function, so it is only evaluated
      // when the stream is non-empty.
      s.headOption map (s.tail -> _)
  }
}
/** Type aliases shared by the XML parsers in this file. */
object XmlParser {
  /** A single pull-parser event. */
  type XML = XMLEvent
  /** An element name. */
  type Tag = String
  /** Character content collected from text and entity-reference events. */
  type Text = String
}
/**
 * Parser combinators over a sequence of XML pull events held in an `F`.
 *
 * @tparam F the input container; an `Input[F]` instance must be available
 */
abstract class XmlParser[F[_]: Input] {
  import XmlParser._

  /** The `Input` instance for `F`; its members are imported below. */
  val input = Input[F]
  import input._

  /** A parser that consumes XML events and yields an `A`. */
  type Next[A] = XML =>> A

  /** Reads the next event, skipping text events that contain only whitespace. */
  lazy val xml: Next[XML] = next[XML] >>= {
    case EvText(x) if x.matches("\\s*") => xml
    case x => ret(x)
  }

  /**
   * Accumulates consecutive text and entity-reference events into one string.
   * Entity references are re-encoded as `&name;`. The first event of any
   * other kind ends the accumulation; that event has already been consumed.
   *
   * @param t the text accumulated so far
   */
  def text(t: Text = ""): Next[Text] = xml >>= {
    case EvText(x) => text(t + x)
    case EvEntityRef(x) => text(t + "&" + x + ";")
    case _ => ret(t)
  }

  /**
   * Reads the next event and dispatches on it.
   *
   * @param gs the accepted element names
   * @param a  the parser to continue with when the event is not a start
   *           element named in `gs` (the event is consumed either way)
   * @param f  builds the continuation from the matched element name
   */
  def tag[A](gs: Tag*)(a: => Next[A])(
    f: Tag => Next[A]): Next[A] = xml >>= {
    case EvElemStart(_, g, _, _) if gs.contains(g) => f(g)
    case _ => a
  }

  /**
   * Runs `a` after a start element named in `g`; fails if the next event is
   * anything else.
   */
  def on[A](g: Tag*)(a: => Next[A]): Next[A] =
    tag(g: _*)(empty[XML, A])(_ => a)
}
/**
 * Sitemap parser: turns a sequence of XML events into a sequence of URL
 * records, each a map from child element name to its text.
 *
 * @tparam F the input/output container; an `Input[F]` instance must be available
 */
abstract class SiteMap[F[_]: Input]() extends XmlParser[F] {
  import XmlParser._
  import SiteMap._
  import input._

  /** Parses the events in `i` into URL records. */
  def apply(i: F[XML]): F[URL]

  /** Parses one record: a `url` start element followed by its fields. */
  val url = on("url")(attrs())

  /** Parses the `urlset` start element followed by the first record. */
  val urlset = on("urlset")(url)

  /**
   * Collects the `loc`, `lastmod`, `changefreq` and `priority` children into
   * `u`. It stops, yielding the map built so far, at the first event that is
   * not the start of one of those elements.
   *
   * @param u the fields collected so far
   */
  def attrs(u: URL = emptyURL): Next[URL] =
    tag("loc", "lastmod", "changefreq", "priority")(ret(u)) {
      g => text() >>= { x => attrs(u + (g -> x)) }
    }
}
/** URL record type, entry point and `SiteMap` instances for `Iterator` and `Stream`. */
object SiteMap {
  import XmlParser._

  /** One sitemap record: child element name mapped to its text. */
  type URL = Map[Tag, Text]

  /** The record with no fields. */
  val emptyURL = Map[Tag, Text]()

  /**
   * Parses `f` with the implicit `SiteMap` instance for `F`.
   *
   * NOTE(review): the body of this method was missing from the source;
   * delegating to the implicit instance is the reconstruction implied by the
   * context bound - confirm.
   */
  def apply[F[_]: SiteMap](f: F[XML]): F[URL] =
    implicitly[SiteMap[F]].apply(f)

  /** Parser over a mutable event iterator; records are produced on demand. */
  implicit val iterator: SiteMap[Iterator] = new SiteMap[Iterator] {

    /**
     * Keeps applying the `url` parser to `i` until it fails.
     *
     * NOTE(review): the source line was truncated after
     * `Iterator.continually(`; the rest is reconstructed from how `apply`
     * uses the result - confirm against the original.
     */
    def urls(i: Iterator[XML]): Iterator[URL] =
      Iterator.continually(url(i))
        .takeWhile(_.isDefined)
        .collect { case Some((_, u)) => u }

    override def apply(i: Iterator[XML]): Iterator[URL] =
      urlset(i).iterator.flatMap {
        // `urlset` yields the first record; the remaining ones follow lazily.
        case (n, u) => Iterator(u) ++ urls(n)
      }
  }

  /** Parser over an immutable event stream; records are produced lazily. */
  implicit val stream: SiteMap[Stream] = new SiteMap[Stream] {

    /** A function from the remaining events to the records parsed from them. */
    type Parser[A] = Stream[XML] => Stream[A]

    /**
     * Runs `a` once; on success emits its result and continues with `f` on
     * the remaining events, on failure ends the output stream.
     */
    def parser[A](a: => Next[A])(
      f: => Parser[A]): Parser[A] =
      a(_).fold(Stream.empty[A]) {
        case (s, u) => Stream.cons(u, f(s))
      }

    // Self-referential on purpose: `f` is by-name and is only evaluated when
    // the tail of the resulting stream is forced.
    val urls: Parser[URL] = parser(url)(urls)

    /** Parses the `urlset` opening plus first record, then the remaining records. */
    val sitemap = parser(urlset)(urls)

    override def apply(s: Stream[XML]): Stream[URL] = sitemap(s)
  }
}
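// NOTE(review): a stray "Copy link" line stood here; it looks like page chrome from the original paste, not code.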

/**
 * Manual harness that runs both `SiteMap` parsers and prints JVM memory
 * figures after every record.
 *
 * The stream parser runs over a single finite document. The iterator parser
 * then runs over an input that repeats the sample record forever, so the
 * program does not terminate on its own.
 *
 * NOTE(review): the source defined `input` and `src` twice and several lines
 * were truncated. The finite variants are renamed `finiteInput`/`finiteSrc`
 * so both can coexist; which variant each parser originally used is a guess.
 */
object SitemapTest extends App {

  val header = """<?xml version="1.0" encoding="UTF-8"?>
      <urlset xmlns="">"""

  // NOTE(review): everything after `<url>` was missing from the source; this
  // sample record is a reconstruction using the four fields the parser reads.
  val body = """<url>
      <loc>http://www.example.com/</loc>
      <lastmod>2005-01-01</lastmod>
      <changefreq>monthly</changefreq>
      <priority>0.8</priority>
   </url>"""

  val footer = "</urlset>"

  /** A complete document containing a single record. */
  val finiteInput = header + body + footer

  /** A fresh event reader over the finite document. */
  def finiteSrc = new XMLEventReader(io.Source.fromString(finiteInput))

  /** Iterator that cycles through `a` forever, restarting whenever it runs out. */
  case class Loop[A](a: Iterable[A]) extends Iterator[A] {
    def hasNext: Boolean = true
    var i = a.iterator
    def next(): A = {
      if (!i.hasNext) i = a.iterator
      i.next()
    }
  }

  /** The header followed by the sample record repeated without end. */
  val input = new Iterable[Char]() {
    def iterator = header.iterator ++ Loop(body)
  }

  /** A fresh event reader over the endless input. */
  def src = new XMLEventReader(io.Source.fromIterable(input))

  /** Prints the JVM's total and free memory in KiB. */
  private def reportMemory(): Unit = {
    println("# total memory : " + Runtime.getRuntime.totalMemory() / 1024)
    println("# free memory : " + Runtime.getRuntime.freeMemory() / 1024)
  }

  // Stream parser; `s` is a `def` so no field holds on to the stream head.
  def s = SiteMap(finiteSrc.toStream)
  s.foreach { x =>
    println(x)
    reportMemory()
  }

  // Iterator parser over the endless input.
  val i = SiteMap(src.toIterator)

  while (i.hasNext) {
    // The visible loop never consumed an element; advance the iterator here.
    println(i.next())
    reportMemory()
  }
}



