Skip to content

Instantly share code, notes, and snippets.

Last active January 21, 2017 15:34
Show Gist options
  • Save owainlewis/8003890 to your computer and use it in GitHub Desktop.
Save owainlewis/8003890 to your computer and use it in GitHub Desktop.
// NOTE(review): this is an sbt build setting, not part of the crawler source below —
// presumably it belongs in build.sbt; confirm where it should live.
// jsoup is the single library dependency declared here.
libraryDependencies ++= Seq(
  "org.jsoup" % "jsoup" % "1.6.1"
)
import java.net.{ URL, MalformedURLException }

import org.jsoup.Jsoup
import org.jsoup.nodes.Document
import org.jsoup.nodes.Element

import scala.collection.JavaConversions._
import scala.collection.JavaConverters._
import scala.util.control.Exception._
/**
 * A hyperlink: its visible text and the raw value of its `href` attribute.
 *
 * Declared `final` because a case class should not be subclassed: the generated
 * `equals`/`hashCode`/`copy` do not account for subclasses.
 *
 * @param title the link's text
 * @param href  the link target exactly as written in the `href` attribute
 */
final case class Link(title: String, href: String)
/**
 * The parts of a page that the crawler keeps.
 *
 * @param title           the page title text
 * @param body            the page body text
 * @param links           the links found on the page
 * @param metaDescription the page's meta description
 */
case class WebDocument(
  title: String,
  body: String,
  links: Seq[Link],
  metaDescription: String
)
/**
 * A small jsoup-based scraper: fetches a URL and reduces the parsed page to a
 * [[WebDocument]].
 */
object Crawler {

  /** Shorthand for jsoup's parsed-document type. */
  type JDoc = org.jsoup.nodes.Document

  /**
   * Fetches the page at `url` with an HTTP GET and parses it.
   *
   * Nothing is caught here: any exception jsoup raises while connecting or
   * parsing propagates to the caller.
   */
  def get(url: String): JDoc = Jsoup.connect(url).get()

  /** Text of the elements matched by the `title` selector. */
  def titleText(doc: JDoc): String = doc.select("title").text

  /** Text of the elements matched by the `body` selector. */
  def bodyText(doc: JDoc): String = doc.select("body").text

  /**
   * Allows for extraction without null pointer exceptions.
   *
   * Looks up the first `meta` element whose `name` attribute is `meta` and
   * reads its `content` attribute. The first-match lookup yields `null` when
   * nothing matches, so it is wrapped in an `Option` before being dereferenced.
   *
   * @param doc  the parsed page
   * @param meta the value of the `name` attribute to look for
   * @return the `content` attribute, or the empty string if there is no such element
   */
  def safeMetaExtract(doc: JDoc, meta: String): String =
    Option(doc.select("meta[name=" + meta + "]").first)
      .map(_.attr("content"))
      .getOrElse("")

  /** The page's `keywords` meta content, or the empty string when absent. */
  def metaKeywords(doc: JDoc): String = safeMetaExtract(doc, "keywords")

  /** The page's `description` meta content, or the empty string when absent. */
  def metaDescription(doc: JDoc): String = safeMetaExtract(doc, "description")

  /**
   * Extracts links from a document.
   *
   * Every anchor that carries an `href` attribute becomes a [[Link]] holding the
   * anchor text and the raw `href` value. The Java collection returned by jsoup
   * is converted explicitly with `asScala` rather than through implicit
   * conversions.
   */
  def linkSequence(doc: JDoc): Seq[Link] =
    doc.select("a[href]").asScala.map(a => Link(a.text, a.attr("href"))).toList

  /** Builds a [[WebDocument]] from the title, body text, links and meta description of `doc`. */
  def extract(doc: JDoc): WebDocument =
    WebDocument(
      title = titleText(doc),
      body = bodyText(doc),
      links = linkSequence(doc),
      metaDescription = metaDescription(doc)
    )

  /**
   * Validates `url` by constructing a `java.net.URL` from it.
   *
   * @return `Some` with the string form of the parsed URL, or `None` when
   *         construction throws `MalformedURLException`
   */
  def safeURL(url: String): Option[String] =
    catching(classOf[MalformedURLException]).opt(new URL(url)).map(_.toString)

  /**
   * Crawl a URL and return a WebDocument.
   *
   * This is `get` followed by `extract`; exceptions thrown by `get` are not caught.
   */
  def crawl(url: String): WebDocument = extract(get(url))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment