Skip to content

Instantly share code, notes, and snippets.

@Fabszn
Created August 27, 2014 19:04
Show Gist options
  • Save Fabszn/f1ebea63b776de6b8fd5 to your computer and use it in GitHub Desktop.
Save Fabszn/f1ebea63b776de6b8fd5 to your computer and use it in GitHub Desktop.
package utils
import rapture.io._
import org.jsoup.Jsoup
import akka.actor.Actor
import scala.language.implicitConversions
import java.net.MalformedURLException
/**
* User: fsznajderman
* Date: 09/11/2013
* Time: 23:57
*/
object WebCrawler {
import strategy.captureExceptions
/**
 * Message describing one page to crawl.
 *
 * @param url address of the page, as a plain string
 * @param id  value of the `id` attribute of the HTML element whose content is wanted
 */
final case class Site(url: String, id: String)
/**
 * Implicit conversion from a plain string to an `HttpUrl`.
 *
 * @param url textual address to parse
 * @return the parsed URL when `Http.parse` yields a `Right`
 * @throws MalformedURLException carrying the original string when `Http.parse` yields a `Left`
 */
implicit def parseUri(url: String): HttpUrl = {
  val parsed = Http.parse(url)
  parsed match {
    case Right(httpUrl) => httpUrl
    case Left(_)        => throw new MalformedURLException(url)
  }
}
/**
 * Queries `uri` once and runs `analyser` on the response body.
 *
 * When the query yields nothing, `analyser` is applied to the empty string instead.
 *
 * @param uri      address to query; implicitly converted to an `HttpUrl`
 * @param analyser extracts the wanted fragment from the body
 * @return whatever `analyser` returns
 */
def simpleCrawl(uri: String, analyser: String => Option[String]): Option[String] = {
  val body = doHttpQuery(uri)(None)(None).getOrElse("")
  analyser(body)
}
/**
 * Queries `uri` in two steps that share one `Browser`, then runs `analyser` on the result.
 *
 * A first query is sent through a fresh `Browser`. If it succeeds, its response is not
 * read; a second query to the same address is issued through that same browser and the
 * body of that second response is handed to `analyser`. When either step yields nothing,
 * `analyser` receives the empty string.
 *
 * @param uri      address to query; implicitly converted to an `HttpUrl`
 * @param analyser extracts the wanted fragment from the body
 * @return whatever `analyser` returns
 */
def complexCrawl(uri: String, analyser: String => Option[String]): Option[String] = {
  // Second pass, run by doHttpQuery with the browser used for the first pass.
  val followUp: Option[Option[Browser] => Option[String]] = Some(doHttpQuery(uri)(None))
  val body = doHttpQuery(uri)(followUp)(Some(new Browser)).getOrElse("")
  analyser(body)
}
/**
 * Sends a POST with an empty body to `url` and returns either the response body or the
 * outcome of a follow-up query.
 *
 * @param url        address to post to
 * @param otherQuery optional follow-up; when present and the POST succeeds, the response
 *                   of this POST is left unread and the follow-up is called with the
 *                   browser used here, its result becoming the return value
 * @param browser    browser to send the request through; a new one is created when absent
 * @return `None` when the POST fails (the error is reported on standard error), otherwise
 *         the slurped response body or the follow-up's result
 */
private def doHttpQuery(url: HttpUrl)(otherQuery: Option[Option[Browser] => Option[String]] = None)(browser: Option[Browser] = None): Option[String] = {
  val session = browser.getOrElse(new Browser())
  session(url).post("", None, Map()) match {
    case Left(failure) =>
      // Report on stderr and end the line, so the diagnostic neither mixes with nor
      // runs into the regular output printed by callers.
      Console.err.println(s"POST to $url failed: ${failure.getMessage}")
      None
    case Right(response) =>
      otherQuery match {
        // Hand the same browser to the follow-up so both requests share it.
        case Some(followUp) => followUp(Some(session))
        case None           => Some(response.slurp[Char].toString)
      }
  }
}
/** HTML analysers that can be passed to the crawl functions. */
object WebAnalyzer {
  /**
   * Returns the inner HTML of the element whose `id` attribute equals `id`.
   *
   * Applying only the first parameter list yields a `String => Option[String]` analyser.
   *
   * @param id      value of the `id` attribute to look up
   * @param content HTML text to parse
   * @return the element's inner HTML, or `None` when parsing or the lookup yields `null`
   */
  def defaultAnalyser(id: String)(content: String): Option[String] =
    // Option(...) turns a null coming back from the Java API into None.
    for {
      doc     <- Option(Jsoup.parse(content))
      element <- Option(doc.getElementById(id))
      html    <- Option(element.html())
    } yield html
}
/**
 * Actor that crawls every [[Site]] message it receives and prints the outcome.
 *
 * For each `Site(url, id)` it runs `WebCrawler.complexCrawl` with the default analyser
 * for `id`, then prints either the extracted content or a "not found" notice.
 */
class WebCrawlerActor extends Actor {
  def receive = {
    case Site(url, id) =>
      val found = WebCrawler.complexCrawl(url, WebAnalyzer.defaultAnalyser(id))
      val report = found.getOrElse(s"Content not found for id : $id")
      println(s"for id : $id => $report")
  }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment