Created
October 25, 2015 23:05
-
-
Save sergey-scherbina/9766f0c213ed1afc5330 to your computer and use it in GitHub Desktop.
Lazy parser for site maps XML
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scala.xml.pull._ | |
import scalaz._ | |
import Scalaz._ | |
/**
 * Type class for an input source `F` that can be taken apart one element at a time.
 *
 * Parsers over such a source are modelled as `StateT` over `Option`: the state is
 * the remaining input and `None` signals that the parser did not produce a value.
 */
trait Input[F[_]] {

  /** A parser that reads from an `F[A]` and produces a `B`. */
  type =>>[A, B] = StateT[Option, F[A], B]

  /** Splits `f` into (remaining input, one element); `None` means no element is available. */
  def uncons[A](f: F[A]): Option[(F[A], A)]

  /** Parser that takes one element via [[uncons]] and returns it; fails when `uncons` yields `None`. */
  def next[A]: (A =>> A) = StateT { (source: F[A]) => uncons(source) }

  /** The failed step result (`None`) typed for a parser from `F[A]` to `B`. */
  def none[A, B] = Option.empty[(F[A], B)]

  /** Parser that always fails, whatever the input. */
  def empty[A, B]: (A =>> B) = StateT { (_: F[A]) => none[A, B] }

  /** Parser that yields `b` and leaves the input state as it received it. */
  def ret[A, B](b: B): (A =>> B) =
    StateT { (source: F[A]) => Option((source, b)) }
}
/** Companion holding the summoner and the `Iterator` / `Stream` instances of [[Input]]. */
object Input {

  /** Summons the implicit [[Input]] instance for `F`. */
  def apply[F[_]: Input]: Input[F] = implicitly[Input[F]]

  /**
   * Instance for `Iterator`.
   *
   * The iterator is advanced in place, so the "remaining input" returned by
   * `uncons` is the very same iterator instance that was passed in.
   */
  implicit val iterator: Input[Iterator] = new Input[Iterator] {
    override def uncons[A](i: Iterator[A]): Option[(Iterator[A], A)] =
      // Option(...) maps a null element to None, so a null element reads as "no element".
      if (i.hasNext) Option(i.next()).map(i -> _)
      else none
  }

  /** Instance for `Stream`: the head is the element, the tail is the remaining input. */
  implicit val stream: Input[Stream] = new Input[Stream] {
    override def uncons[A](s: Stream[A]): Option[(Stream[A], A)] =
      s.headOption.map(s.tail -> _)
  }
}
/** Type aliases shared by the XML parsers in this file. */
object XmlParser {

  /** Name of an element, as matched against the label of a start-element event. */
  type Tag = String

  /** Character content accumulated from text and entity-reference events. */
  type Text = String

  /** A single event of the pull parser. */
  type XML = XMLEvent
}
/**
 * Small parser-combinator layer over a source `F` of XML pull events.
 *
 * Every combinator reads events through [[xml]]; an event that has been read is
 * consumed from the parser state even when the combinator does not use it.
 */
abstract class XmlParser[F[_]: Input] {
  import XmlParser._

  /** The [[Input]] instance for `F`; its members are imported below. */
  val input = Input[F]
  import input._

  /** A parser over XML events producing an `A`. */
  type Next[A] = XML =>> A

  /** Reads the next event, skipping `EvText` events whose content matches `\s*`. */
  lazy val xml: Next[XML] = next[XML] >>= { event =>
    event match {
      case EvText(chars) if chars.matches("\\s*") => xml
      case other                                  => ret(other)
    }
  }

  /**
   * Appends consecutive text and entity-reference events to `t`.
   *
   * Entity references are kept in their `&name;` form. The first event of any
   * other kind ends the accumulation; that event is consumed and `t` is returned.
   */
  def text(t: Text = ""): Next[Text] = xml >>= { event =>
    event match {
      case EvText(chunk)     => text(t + chunk)
      case EvEntityRef(name) => text(t + "&" + name + ";")
      case _                 => ret(t)
    }
  }

  /**
   * Reads one event and dispatches on it.
   *
   * @param gs element names of interest
   * @param a  parser to continue with when the event is not a start tag named in `gs`
   * @param f  builds the continuation from the matched element name
   */
  def tag[A](gs: Tag*)(a: => Next[A])(f: Tag => Next[A]): Next[A] =
    xml >>= { event =>
      event match {
        case EvElemStart(_, label, _, _) if gs contains label => f(label)
        case _                                                => a
      }
    }

  /** Runs `a` only if the next event is a start tag named in `g`; fails otherwise. */
  def on[A](g: Tag*)(a: => Next[A]) =
    tag(g: _*)(empty[XML, A]) { _ => a }
}
/**
 * Parser for sitemap documents read from a source `F` of XML events.
 *
 * Concrete instances decide how the per-entry parsers are driven over `F`.
 */
abstract class SiteMap[F[_]: Input]() extends XmlParser[F] {
  import XmlParser._
  import SiteMap._
  import input._

  /** Turns a source of XML events into a source of parsed URL entries. */
  def apply(i: F[XML]): F[URL]

  /** Parses one entry: requires a `url` start tag, then collects its fields. */
  val url = on("url")(attrs())

  /** Requires a `urlset` start tag and then parses the first entry. */
  val urlset = on("urlset")(url)

  /**
   * Collects the recognised child elements of an entry into `u`.
   *
   * Each recognised start tag is followed by its text, stored under the tag
   * name; the first event that is not one of these start tags ends the entry
   * and the map collected so far is returned.
   */
  def attrs(u: URL = emptyURL): Next[URL] = {
    val fields = Seq("loc", "lastmod", "changefreq", "priority")
    tag(fields: _*)(ret(u)) { field =>
      text() >>= { value => attrs(u.updated(field, value)) }
    }
  }
}
/** Companion holding the URL entry type, the entry point and the `Iterator` / `Stream` instances. */
object SiteMap {
  import XmlParser._

  /** One sitemap entry: element name mapped to its text. */
  type URL = Map[Tag, Text]

  /** Entry with no fields; the starting point when collecting an entry. */
  val emptyURL: URL = Map[Tag, Text]()

  /** Parses `f` with the implicit [[SiteMap]] instance for `F`. */
  def apply[F[_]: SiteMap](f: F[XML]): F[URL] =
    implicitly[SiteMap[F]].apply(f)

  /**
   * Instance for `Iterator`.
   *
   * The explicit type keeps the helper below out of the value's static type
   * instead of leaking it through an inferred structural refinement.
   */
  implicit val iterator: SiteMap[Iterator] = new SiteMap[Iterator] {

    /** Keeps parsing entries from `i` until the entry parser fails. */
    def urls(i: Iterator[XML]): Iterator[URL] =
      Iterator
        .continually(url.eval(i))
        .takeWhile(_.isDefined)
        .collect { case Some(entry) => entry } // total here: takeWhile only lets Some through

    override def apply(i: Iterator[XML]): Iterator[URL] =
      // urlset yields the first entry together with the remaining input; the rest follow on demand.
      urlset(i).iterator.flatMap {
        case (rest, first) => Iterator(first) ++ urls(rest)
      }
  }

  /** Instance for `Stream`; explicitly typed for the same reason as the `Iterator` instance. */
  implicit val stream: SiteMap[Stream] = new SiteMap[Stream] {

    /** Function from a stream of events to a stream of results. */
    type Parser[A] = Stream[XML] => Stream[A]

    /**
     * Runs `a` once; on failure the result is empty, otherwise the parsed value
     * is the head and `f` produces the tail from the remaining input
     * (`Stream.cons` takes its tail by name, so it is built on demand).
     */
    def parser[A](a: => Next[A])(f: => Parser[A]): Parser[A] =
      a(_).fold(Stream.empty[A]) {
        case (rest, value) => Stream.cons(value, f(rest))
      }

    /** Repeats the entry parser until it fails; self-reference is safe because `f` is by-name. */
    val urls: Parser[URL] = parser(url)(urls)

    /** Parses the `urlset` opening plus first entry, then the remaining entries. */
    val sitemap = parser(urlset)(urls)

    override def apply(s: Stream[XML]): Stream[URL] = sitemap(s)
  }
}
Author
sergey-scherbina
commented
Oct 25, 2015
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment