Skip to content

Instantly share code, notes, and snippets.

@tyrcho
Last active February 3, 2017 13:13
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tyrcho/4579023 to your computer and use it in GitHub Desktop.
Save tyrcho/4579023 to your computer and use it in GitHub Desktop.
Processing HTML with Scala as if it were XML
import java.net.URL
import scala.xml.XML
import org.xml.sax.InputSource
import scala.xml.parsing.NoBindingFactoryAdapter
import org.ccil.cowan.tagsoup.jaxp.SAXFactoryImpl
import java.net.HttpURLConnection
import scala.xml.Node
/**
 * Loads a document from a URL and parses it into a `scala.xml.Node`, using the
 * SAX parser created by TagSoup's `SAXFactoryImpl` together with a
 * `NoBindingFactoryAdapter`.
 */
object HTML {
  // NOTE(review): the adapter and parser are created once and shared by every
  // call to `load`; presumably they are not safe for concurrent use — confirm
  // before calling `load` from several threads.
  lazy val adapter = new NoBindingFactoryAdapter
  lazy val parser = (new SAXFactoryImpl).newSAXParser

  /**
   * Fetches `url` and parses the response body into an XML tree.
   *
   * @param url     location of the document to load
   * @param headers request properties set on the connection before its input
   *                stream is opened (empty by default)
   * @return the node returned by the adapter's `loadXML` for the parsed document
   */
  def load(url: URL, headers: Map[String, String] = Map.empty): Node = {
    // No downcast to HttpURLConnection: setRequestProperty and getInputStream
    // are declared on URLConnection, so non-HTTP URLs (e.g. file:) work as well.
    val conn = url.openConnection()
    for ((k, v) <- headers)
      conn.setRequestProperty(k, v)
    val in = conn.getInputStream
    // Always release the stream, including when parsing throws.
    try adapter.loadXML(new InputSource(in), parser)
    finally in.close()
  }
}
import java.net.URL
// Download the blog's front page and print the `href` attribute of every
// anchor element that carries one.
val site = new URL("http://michel-daviot.blogspot.fr/")
val content = HTML.load(site)
(content \\ "a").foreach { anchor =>
  // `attribute` yields an Option; anchors without an href are simply skipped.
  anchor.attribute("href").foreach(target => println(target))
}
<!-- TagSoup: supplies org.ccil.cowan.tagsoup.jaxp.SAXFactoryImpl, which the Scala HTML object imports. -->
<dependency>
<groupId>org.ccil.cowan.tagsoup</groupId>
<artifactId>tagsoup</artifactId>
<version>1.2.1</version>
</dependency>
/** Configures the JVM system properties that select an HTTP/HTTPS proxy. */
object SetProxy {
  /**
   * Sets `http.proxyHost`, `http.proxyPort`, `https.proxyHost` and
   * `https.proxyPort` to the given proxy.
   *
   * The values are written with `System.setProperty`, so they apply to the
   * whole JVM and overwrite any previous setting of those properties.
   *
   * @param proxyConfig the proxy as a `(host, port)` pair
   */
  def apply(proxyConfig: (String, Int)): Unit = {
    val (host, port) = proxyConfig
    // The same proxy is used for both protocols.
    for (protocol <- Seq("http", "https")) {
      System.setProperty(s"$protocol.proxyHost", host)
      System.setProperty(s"$protocol.proxyPort", port.toString)
    }
  }
}
import java.net.URL
import scala.xml.XML
// Counter-example: loading the same page with the plain scala.xml loader.
val site = new URL("http://michel-daviot.blogspot.fr/")
XML.load(site)
// The call above was observed to fail with:
//Exception in thread "main" org.xml.sax.SAXParseException; lineNumber: 7; columnNumber: 265;
//The entity name must immediately follow the '&' in the entity reference.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment