Skip to content

Instantly share code, notes, and snippets.

Created September 17, 2010 12:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yuroyoro/584191 to your computer and use it in GitHub Desktop.
Save yuroyoro/584191 to your computer and use it in GitHub Desktop.
import{Codec, Source}
import scala.collection.{Iterable, IterableLike}
/**
 * A collection of text lines that is itself an `Iterable[String]`.
 *
 * It mixes in `IterableLike[String, ScrapedHtml]` and overrides `newBuilder`,
 * presumably so that inherited collection operations (e.g. `filter`) produce
 * a `ScrapedHtml` again rather than a plain `Iterable` -- confirm against the
 * collections version this was written for.
 *
 * NOTE(review): the closing braces of this trait and of several members are
 * not present in the visible text; the listing looks truncated by extraction.
 * Verify against the original before compiling.
 */
trait ScrapedHtml extends Iterable[String] with IterableLike[String, ScrapedHtml]{
// Backing lines; `iterator` and `write` read from this.
val src:Iterable[String]
import scala.collection.generic.CanBuildFrom
import scala.collection.mutable.{ListBuffer, Builder}
/** Factory hook: wraps a finished list of lines in the concrete subtype. Used by `newBuilder`. */
def newTo(from:List[String]):ScrapedHtml
/** Iterates over the backing lines by delegating to `src`. */
def iterator = src.iterator
/** Builder that accumulates lines in a `ListBuffer` and converts the result with `newTo`. */
override def newBuilder:Builder[String, ScrapedHtml] = new ListBuffer[String] mapResult {x => newTo(x) }
/** Implicit `CanBuildFrom` whose two `apply` overloads both hand back `newBuilder`. */
implicit def canBuildFrom: CanBuildFrom[ScrapedHtml, String, ScrapedHtml] = new CanBuildFrom[ScrapedHtml, String, ScrapedHtml] {
def apply(from: ScrapedHtml):Builder[String, ScrapedHtml] = newBuilder
def apply() = newBuilder
/**
 * Writes all lines of `src`, joined with the platform line separator, to a file.
 *
 * @param fileName path handed to `FileWriter`
 */
def write( fileName:String ):Unit = {
import scala.util.control.Exception._
// NOTE(review): the `}.foreach{ bw =>` below closes a brace that is not opened
// in the visible text, so a wrapper around this constructor call seems to be
// missing (possibly something like `allCatch.opt{`) -- TODO confirm.
new BufferedWriter(new FileWriter(fileName))
}.foreach{ bw =>
// `andFinally` registers `bw.close` to run after the write body, whether or not it throws.
allCatch.andFinally{ bw.close } {
bw.write( src.mkString(System.getProperty("line.separator")))
/** Abstract: returns a `ScrapedHtml` derived from this one; semantics are defined by the subtypes. */
def parse:ScrapedHtml
/**
 * `ScrapedHtml` holding lines that have not been through `parse` yet.
 *
 * @param src the backing lines
 */
case class RawHtml(src:Iterable[String]) extends ScrapedHtml {
/** Rebuilds a `RawHtml` from a list of lines. */
def newTo(from:List[String]) = RawHtml(from)
/**
 * Produces a `ParsedHtml` from lines in which every match of `<.+?>` (shortest
 * run between angle brackets) or a tab character is replaced by the empty
 * string, and lines that end up empty are dropped.
 */
def parse =
// NOTE(review): the lambda below has no visible receiver; presumably it was
// applied to the lines via something like `src.map` -- TODO confirm.
ParsedHtml({ _.replaceAll("""<.+?>|\t""", "") }.filter{ _.nonEmpty })
/**
 * `ScrapedHtml` that is the output of `RawHtml.parse`.
 *
 * @param src the backing lines
 */
case class ParsedHtml(src:Iterable[String]) extends ScrapedHtml{
/** Rebuilds a `ParsedHtml` from a list of lines. */
def newTo(from:List[String]) = ParsedHtml(from)
/** Parsing again is a no-op: returns this same instance. */
def parse:ScrapedHtml = this
/**
 * Entry points for fetching a URL as a `ScrapedHtml` and saving it.
 *
 * NOTE(review): several expressions in this object are incomplete in the
 * visible text and no closing braces are present; the listing looks truncated
 * by extraction. Verify against the original before compiling.
 */
object HtmlScraper {
/** Fetches `url` via `getSource` and wraps its lines in a `RawHtml`. */
def apply(url:String):ScrapedHtml = RawHtml(getSource(url).getLines.toSeq)
/**
 * Opens a stream on `url`, buffers bytes into an array, and picks an implicit
 * `Codec` for decoding.
 *
 * NOTE(review): the return expression is not visible; `apply` calls `getLines`
 * on the result, so it is presumably a `Source` -- TODO confirm.
 * NOTE(review): nothing in the visible lines closes `in`.
 */
def getSource(url: String ) = {
val in = new URL(url).openStream
// NOTE(review): the body passed to `continually` is empty and the `takeWhile`
// predicate `-1 !=` has no right-hand operand here; presumably this read from
// `in` until -1 (end of stream) -- TODO confirm.
val buf = Stream.continually{ }.takeWhile{ -1 != }.map{ _.byteValue}.toArray
implicit val codec = {
// Matches a whole line containing "content" followed by "charset = <name>" and
// captures <name>. The character class accepts digits, lower-case a-z, '|',
// '-' and '_'; upper-case names such as "UTF-8" do not match.
val Charset = """.*content.*charset\s*=\s*([0-9a-z|\-|_]+).*""".r
// Yields the captured charset name as a Codec; the String-to-Codec step is not
// visible here (presumably an implicit conversion -- verify).
val pf:PartialFunction[String, Codec] = { case Charset(cs) => cs }
// NOTE(review): the start of this expression is missing; what remains applies
// `pf` where it is defined and otherwise falls back to `Codec.default`.
pf.isDefinedAt }.collect{ pf }.getOrElse{ Codec.default }
/**
 * Fetches `url`, optionally running `parse` on the result.
 *
 * @param url      page to fetch
 * @param fileName not used in the visible lines; presumably the target passed to `write` -- TODO confirm
 * @param toParse  when true (the default) the fetched lines go through `parse`; otherwise they are kept raw
 */
def download(url:String, fileName:String, toParse:Boolean = true ):Unit = {
val html = if(toParse) HtmlScraper(url).parse else HtmlScraper(url)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment