Created
August 27, 2014 19:04
-
-
Save Fabszn/f1ebea63b776de6b8fd5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package utils | |
import rapture.io._ | |
import org.jsoup.Jsoup | |
import akka.actor.Actor | |
import scala.language.implicitConversions | |
import java.net.MalformedURLException | |
/** | |
* User: fsznajderman | |
* Date: 09/11/2013 | |
* Time: 23:57 | |
*/ | |
object WebCrawler { | |
import strategy.captureExceptions | |
/**
 * Crawl request handled by `WebCrawlerActor`.
 *
 * @param url address of the page to crawl
 * @param id  id of the HTML element whose content should be extracted from that page
 */
final case class Site(url: String, id: String)
/**
 * Implicitly converts a raw URL string into an `HttpUrl`.
 *
 * @param url textual URL to parse
 * @return the parsed URL when `Http.parse` yields a `Right`
 * @throws MalformedURLException when `Http.parse` yields a `Left`; the message is the
 *                               offending string and the parse failure is attached as cause
 */
implicit def parseUri(url: String): HttpUrl =
  Http.parse(url) match {
    case Right(parsed) => parsed
    case Left(failure) =>
      // MalformedURLException has no cause-taking constructor, so chain the cause explicitly
      // instead of dropping it. NOTE(review): relies on the Left value being a Throwable.
      val malformed = new MalformedURLException(url)
      malformed.initCause(failure)
      throw malformed
  }
/**
 * Fetches `uri` with a single POST and runs `analyser` over the response body.
 *
 * @param uri      address to query; converted to an `HttpUrl` by the implicit `parseUri`
 * @param analyser extracts the wanted fragment from the response body
 * @return whatever `analyser` returns; it receives an empty string when the query yields nothing
 */
def simpleCrawl(uri: String, analyser: String => Option[String]): Option[String] = {
  val body = doHttpQuery(uri)(None)(None).getOrElse("")
  analyser(body)
}
/**
 * Queries `uri` twice through one shared `Browser` and analyses the second response.
 *
 * The first POST's body is discarded; once it succeeds, the follow-up query is issued with
 * the same browser instance and its result is what gets analysed.
 *
 * @param uri      address to query; converted to an `HttpUrl` by the implicit `parseUri`
 * @param analyser extracts the wanted fragment from the response body
 * @return whatever `analyser` returns; it receives an empty string when the query yields nothing
 */
def complexCrawl(uri: String, analyser: String => Option[String]): Option[String] = {
  // Follow-up request, left partially applied so doHttpQuery can supply the browser later.
  val followUp: Option[Browser] => Option[String] = doHttpQuery(uri)(None)
  val body = doHttpQuery(uri)(Some(followUp))(Some(new Browser)).getOrElse("")
  analyser(body)
}
/**
 * Sends a POST with an empty body to `url` and optionally chains a second query.
 *
 * @param url        target of the POST request
 * @param otherQuery optional follow-up; when present and the POST succeeds, it is invoked with
 *                   the browser used for the POST and its result is returned instead of the
 *                   POST's own response
 * @param browser    browser to send the request with; a new `Browser` is created when absent
 * @return the response body (or the follow-up's result), or `None` when the POST yields a
 *         `Left`, in which case the failure message is printed to standard output
 */
private def doHttpQuery(url: HttpUrl)(otherQuery: Option[Option[Browser] => Option[String]] = None)(browser: Option[Browser] = None): Option[String] = {
  val session = browser getOrElse new Browser()
  session(url).post("", None, Map()) match {
    case Right(response) =>
      // Only read the body when there is no follow-up query to hand over to.
      otherQuery.fold[Option[String]](Some(response.slurp[Char].toString)) { followUp =>
        followUp(Some(session))
      }
    case Left(failure) =>
      print(failure.getMessage)
      None
  }
}
/** Analysers usable as the `analyser` argument of the crawl methods. */
object WebAnalyzer {

  /**
   * Looks up the element with the given id in an HTML document and returns its `html()` content.
   *
   * Every Jsoup call is wrapped in `Option` so that a `null` coming back from the Java API
   * becomes `None` instead of being matched on explicitly.
   *
   * @param id      id of the element to look up
   * @param content HTML source to parse
   * @return the element's `html()` value, or `None` when the document, the element or the
   *         html value is `null`
   */
  def defaultAnalyser(id: String)(content: String): Option[String] =
    for {
      doc     <- Option(Jsoup.parse(content))
      element <- Option(doc.getElementById(id))
      html    <- Option(element.html())
    } yield html
}
/**
 * Web crawler actor: for each received `Site`, crawls its url, extracts the element with the
 * requested id and prints the outcome to standard output.
 */
class WebCrawlerActor extends Actor {
  def receive = {
    case Site(url, id) =>
      val analyser: String => Option[String] = WebAnalyzer.defaultAnalyser(id)
      val found = WebCrawler.complexCrawl(url, analyser)
      // Fall back to an explanatory message when nothing was extracted.
      val report = found.getOrElse(s"Content not found for id : $id")
      println(s"for id : $id => $report")
  }
}
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment