Skip to content

Instantly share code, notes, and snippets.

@sergey-scherbina
Created October 25, 2015 23:05
Show Gist options
  • Save sergey-scherbina/9766f0c213ed1afc5330 to your computer and use it in GitHub Desktop.
Save sergey-scherbina/9766f0c213ed1afc5330 to your computer and use it in GitHub Desktop.
Lazy parser for XML sitemaps
import scala.xml.pull._
import scalaz._
import Scalaz._
/**
 * Type class for input sources `F[_]` that can be consumed one element at a time.
 *
 * Parsers over such an input are scalaz `StateT` computations in `Option`:
 * the state is the remaining input, and `None` stands for a failed parse.
 */
trait Input[F[_]] {

  /** A parser that consumes input made of `A` elements and produces a `B`. */
  type =>>[A, B] = StateT[Option, F[A], B]

  /**
   * Splits the input into its remainder and its first element.
   *
   * @param f the input to read from
   * @return the remaining input paired with the element read, or `None`
   *         when no element can be read
   */
  def uncons[A](f: F[A]): Option[(F[A], A)]

  /** Parser that reads a single element through [[uncons]]. */
  def next[A]: (A =>> A) = StateT { in => uncons[A](in) }

  /** The failed result of running a parser from `A` elements to a `B`. */
  def none[A, B]: Option[(F[A], B)] = None

  /** Parser that fails on every input. */
  def empty[A, B]: (A =>> B) = StateT { _ => none[A, B] }

  /** Parser that produces `b` and hands the input back unchanged. */
  def ret[A, B](b: B): (A =>> B) = StateT { in => Option((in, b)) }
}
/** Summoner and standard instances for the [[Input]] type class. */
object Input {

  /** Returns the implicit [[Input]] instance available for `F`. */
  def apply[F[_]: Input]: Input[F] = implicitly[Input[F]]

  /**
   * Instance for `Iterator`.
   *
   * Reading advances the given iterator in place; the "remaining input"
   * returned by `uncons` is that same iterator.
   */
  implicit val iterator: Input[Iterator] = new Input[Iterator] {
    override def uncons[A](i: Iterator[A]): Option[(Iterator[A], A)] =
      // `Some`, not `Option(...)`: a null element is still an element and must
      // not be reported as end of input (the Stream instance keeps it as well).
      if (i.hasNext) Some(i -> i.next())
      else none
  }

  /** Instance for `Stream`: the head is the element read, the tail is the remaining input. */
  implicit val stream: Input[Stream] = new Input[Stream] {
    override def uncons[A](s: Stream[A]): Option[(Stream[A], A)] =
      s.headOption map (s.tail -> _)
  }
}
/** Type aliases shared by the XML parsers in this file. */
object XmlParser {

  /** A single event produced by the XML pull parser. */
  type XML = XMLEvent

  /** The label of an XML element. */
  type Tag = String

  /** Character content collected from an element. */
  type Text = String
}
/**
 * Building blocks for parsers over an input `F` of XML pull events.
 *
 * Every parser is a `Next[A]`: run against the remaining events it either
 * fails or yields the events left over together with an `A`.
 */
abstract class XmlParser[F[_]: Input] {
  import XmlParser._

  /** The [[Input]] instance used to read events from `F`. */
  val input = Input[F]
  import input._

  /** A parser over XML events that produces an `A`. */
  type Next[A] = XML =>> A

  /** True when `s` matches `\s*`, i.e. it is empty or consists of whitespace only. */
  private def isBlank(s: String): Boolean = s.matches("\\s*")

  /** Reads the next event, skipping over text events that are blank. */
  lazy val xml: Next[XML] = next[XML] >>= {
    case EvText(chunk) if isBlank(chunk) => xml
    case event => ret(event)
  }

  /**
   * Collects character content, appending it to `t`.
   *
   * Text events are appended as they are and entity references are appended
   * in their `&name;` form. The first event of any other kind ends the text;
   * that event has already been read and is not handed back.
   *
   * NOTE: events are read through [[xml]], so blank text events are skipped
   * here too and contribute nothing to the result.
   *
   * @param t the text accumulated so far
   */
  def text(t: Text = ""): Next[Text] = xml >>= {
    case EvText(chunk) => text(t + chunk)
    case EvEntityRef(entity) => text(t + "&" + entity + ";")
    case _ => ret(t)
  }

  /**
   * Reads one event and dispatches on it.
   *
   * @param gs the element labels of interest
   * @param a  the parser to continue with when the event is not a start tag
   *           labelled with one of `gs` (the event stays consumed)
   * @param f  builds the parser to continue with from the matched label
   */
  def tag[A](gs: Tag*)(a: => Next[A])(f: Tag => Next[A]): Next[A] =
    xml >>= {
      case EvElemStart(_, label, _, _) if gs.contains(label) => f(label)
      case _ => a
    }

  /** Runs `a` after a start tag labelled with one of `g`; fails on any other event. */
  def on[A](g: Tag*)(a: => Next[A]): Next[A] =
    tag(g: _*)(empty[XML, A])(_ => a)
}
/**
 * Sitemap parser over an input `F` of XML events.
 *
 * Each `url` entry is represented as a map from child element label to the
 * text read for it.
 */
abstract class SiteMap[F[_]: Input]() extends XmlParser[F] {
  import XmlParser._
  import SiteMap._
  import input._

  /** Turns the XML events of a sitemap into its URL entries. */
  def apply(i: F[XML]): F[URL]

  /** Parses one `url` element: its start tag followed by its properties. */
  val url: Next[URL] = on("url")(attrs())

  /** Parses the `urlset` start tag followed by the first `url` element. */
  val urlset: Next[URL] = on("urlset")(url)

  /**
   * Reads `loc`, `lastmod`, `changefreq` and `priority` elements one after
   * another, adding each label with its text to `u`. The first event that is
   * not the start tag of one of these elements ends the entry.
   *
   * @param u the properties gathered so far
   */
  def attrs(u: URL = emptyURL): Next[URL] =
    tag("loc", "lastmod", "changefreq", "priority")(ret(u)) { label =>
      text() >>= { value => attrs(u + (label -> value)) }
    }
}
/** Entry point and standard instances of the sitemap parser. */
object SiteMap {
  import XmlParser._

  /** One sitemap entry: child element label mapped to its text. */
  type URL = Map[Tag, Text]

  /** An entry without any properties. */
  val emptyURL: URL = Map[Tag, Text]()

  /**
   * Parses the sitemap entries out of `f` with the implicit parser for `F`.
   *
   * @param f the XML events of the sitemap
   * @return the entries, in the same kind of container as the input
   */
  def apply[F[_]: SiteMap](f: F[XML]): F[URL] =
    implicitly[SiteMap[F]].apply(f)

  /** Parser over an `Iterator` of events; entries are parsed as the result is consumed. */
  implicit val iterator: SiteMap[Iterator] = new SiteMap[Iterator] {

    /** Keeps parsing `url` entries from `i` until one fails to parse. */
    def urls(i: Iterator[XML]): Iterator[URL] =
      Iterator.continually(url.eval(i))
        .takeWhile(_.isDefined)
        .collect { case Some(entry) => entry }

    override def apply(i: Iterator[XML]): Iterator[URL] =
      urlset(i).iterator.flatMap {
        // `urlset` already produced the first entry; the rest follow lazily.
        case (rest, first) => Iterator(first) ++ urls(rest)
      }
  }

  /** Parser over a `Stream` of events; the resulting stream is built lazily. */
  implicit val stream: SiteMap[Stream] = new SiteMap[Stream] {

    /** A function from the remaining events to the entries parsed from them. */
    type Parser[A] = Stream[XML] => Stream[A]

    /**
     * Runs `a` once; on success emits its result followed, lazily, by whatever
     * `f` produces from the remaining events. On failure the stream ends.
     */
    def parser[A](a: => Next[A])(f: => Parser[A]): Parser[A] =
      events =>
        a(events).fold(Stream.empty[A]) {
          case (rest, result) => Stream.cons(result, f(rest))
        }

    /** Parses `url` entries repeatedly; the self-reference is safe because `f` is by-name. */
    val urls: Parser[URL] = parser(url)(urls)

    /** Parses the `urlset` start tag with the first entry, then the remaining entries. */
    val sitemap: Parser[URL] = parser(urlset)(urls)

    override def apply(s: Stream[XML]): Stream[URL] = sitemap(s)
  }
}
@sergey-scherbina
Copy link
Author

/**
 * Manual demo of the sitemap parser.
 *
 * The first part parses a small, complete sitemap with both the `Iterator`
 * and the `Stream` parser. The second part feeds an endless sequence of
 * `url` entries and prints JVM memory figures for every parsed entry.
 */
object SitemapTest extends App {

  val header = """<?xml version="1.0" encoding="UTF-8"?>
      <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">"""

  // Ampersands in the URLs are written as `&amp;`: a raw `&` is not
  // well-formed XML, and the sitemap protocol requires entity escaping.
  val body = """<url>
      <loc>http://www.example.com/</loc>
      <lastmod>2005-01-01</lastmod>
      <changefreq>monthly</changefreq>
      <priority>0.8</priority>
   </url>
   <url>
      <loc>http://www.example.com/catalog?item=12&amp;desc=vacation_hawaii</loc>
      <changefreq>weekly</changefreq>
   </url>
   <url>
      <loc>http://www.example.com/catalog?item=73&amp;desc=vacation_new_zealand</loc>
      <lastmod>2004-12-23</lastmod>
      <changefreq>weekly</changefreq>
   </url>
   <url>
      <loc>http://www.example.com/catalog?item=74&amp;desc=vacation_newfoundland</loc>
      <lastmod>2004-12-23T18:00:15+00:00</lastmod>
      <priority>0.3</priority>
   </url>
   <url>
      <loc>http://www.example.com/catalog?item=83&amp;desc=vacation_usa</loc>
      <lastmod>2004-11-23</lastmod>
   </url>"""

  val footer = "</urlset>"

  // Part 1: a finite document, parsed once per input kind.
  {
    val input = header + body + footer
    // `def`, so each parser below gets its own fresh event reader.
    def src = new XMLEventReader(io.Source.fromString(input))
    SiteMap(src.toIterator).foreach(println)
    SiteMap(src.toStream).foreach(println)
  }

  // Part 2: endless input.
  {
    /** Iterator that never ends: it starts over on `a` whenever `a` is exhausted. */
    case class Loop[A](a: Iterable[A]) extends Iterator[A] {
      def hasNext: Boolean = true
      var i = a.iterator
      def next(): A = {
        if (!i.hasNext) i = a.iterator
        i.next()
      }
    }

    // The header once, then the `url` entries of `body` repeated forever.
    val input = new Iterable[Char]() {
      def iterator = header.iterator ++ Loop(body)
    }

    def src = new XMLEventReader(io.Source.fromIterable(input))

    // `def` rather than `val`, so no field keeps a reference to the stream head.
    def s = SiteMap(src.toStream)
    // NOTE: the input never ends, so this loop is not expected to finish on its
    // own; the iterator variant below is not reached while it is running.
    s.foreach { x =>
      println("# total memory : " + Runtime.getRuntime.totalMemory() / 1024)
      println("# free memory : " + Runtime.getRuntime.freeMemory() / 1024)
      println(x)
    }

    val i = SiteMap(src.toIterator)

    while (i.hasNext) {
      println("# total memory : " + Runtime.getRuntime.totalMemory() / 1024)
      println("# free memory : " + Runtime.getRuntime.freeMemory() / 1024)
      println(i.next())
    }

  }

}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment