Scrape.scala: @douglarek's fork of owainlewis/Scrape.scala (created January 21, 2017)
// build.sbt
libraryDependencies ++= Seq(
  "org.jsoup" % "jsoup" % "1.6.1"
)
// Scrape.scala
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
import scala.collection.JavaConverters._
import java.net.{ URL, MalformedURLException }
import scala.util.control.Exception._
final case class Link(title: String, href: String)
final case class WebDocument(
  title: String,
  body: String,
  links: Seq[Link],
  metaDescription: String
)
object Crawler {

  type JDoc = Document

  def get(url: String): JDoc = Jsoup.connect(url).get()

  def titleText(doc: JDoc): String = doc.select("title").text

  def bodyText(doc: JDoc): String = doc.select("body").text
  /**
   * Extracts a meta tag's content without risking a NullPointerException:
   * `first` returns null when no element matches, so wrap it in Option.
   */
  def safeMetaExtract(doc: JDoc, meta: String): String =
    Option(doc.select(s"meta[name=$meta]").first) match {
      case Some(element) => element.attr("content")
      case None          => ""
    }
  def metaKeywords(doc: JDoc): String = safeMetaExtract(doc, "keywords")

  def metaDescription(doc: JDoc): String = safeMetaExtract(doc, "description")
  /**
   * Extracts all anchor links from a document.
   */
  def linkSequence(doc: JDoc): Seq[Link] =
    doc.select("a[href]").asScala.toList.map { a =>
      Link(a.text, a.attr("href"))
    }
  def extract(doc: JDoc): WebDocument = {
    val title: String    = titleText(doc)
    val body: String     = bodyText(doc)
    val links: Seq[Link] = linkSequence(doc)
    val desc: String     = metaDescription(doc)
    WebDocument(title, body, links, desc)
  }
  /**
   * Validates a URL string, returning None when it is malformed.
   */
  def safeURL(url: String): Option[String] =
    (catching(classOf[MalformedURLException]) opt new URL(url)).map(_.toString)
  /**
   * Crawls a URL and returns a WebDocument.
   */
  def crawl(url: String): WebDocument = {
    val f = extract _ compose get
    f(url)
  }
}
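
A minimal usage sketch, not part of the original gist: it assumes the sbt dependency above and live network access, and wraps the crawl in scala.util.Try, since Jsoup.connect(url).get() can throw on network or HTTP errors. safeURL screens out malformed URLs up front; the example URL is a placeholder.

// Usage sketch (assumption: run alongside Scrape.scala with the dependency above).
import scala.util.{ Try, Success, Failure }

object Main extends App {
  val url = "https://example.com"

  Crawler.safeURL(url) match {
    case Some(validUrl) =>
      // crawl performs network I/O, so absorb any exception it throws
      Try(Crawler.crawl(validUrl)) match {
        case Success(doc) =>
          println(s"Title: ${doc.title}")
          println(s"Links found: ${doc.links.size}")
        case Failure(e) =>
          println(s"Crawl failed: ${e.getMessage}")
      }
    case None =>
      println(s"Malformed URL: $url")
  }
}

Note that safeURL is declared in the gist but never called by crawl itself; checking it at the call site, as here, keeps crawl simple while still guarding against bad input.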