Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
HTML Parsing in Clojure using HtmlCleaner.
(ns in.grok.history.html-parser
(:require [clojure.contrib.logging :as log])
(:import [org.htmlcleaner HtmlCleaner]
[org.apache.commons.lang StringEscapeUtils]))
(defn parse-page
  "Parses the HTML source of a web page and returns a map holding the page
  :title and the tag-stripped :content, both with HTML entities unescaped.
  :title is nil when no <title> element is found.

  No encoding detection happens here; the caller is expected to have
  dealt with that already.

  Returns nil when page-src is nil or when the cleaner yields no node.
  Any Exception raised while parsing is logged and not rethrown; the
  result is then whatever the log call returns (presumably nil)."
  [page-src]
  (try
    (when page-src
      (let [html-cleaner (HtmlCleaner.)
            props        (.getProperties html-cleaner)]
        ;; Configure the cleaner: drop comments and prune <script>/<style>
        ;; subtrees so their bodies never end up in the extracted text.
        (.setOmitComments props true)
        (.setPruneTags props "script,style")
        (when-let [root (.clean html-cleaner page-src)]
          ;; Text of an element, stringified and with entities decoded.
          (let [plain-text (fn [element]
                             (StringEscapeUtils/unescapeHtml
                              (str (.getText element))))]
            {:title   (when-let [title-node (.findElementByName root "title" true)]
                        (plain-text title-node))
             :content (plain-text root)}))))
    (catch Exception ex
      (log/error "Error when parsing" ex))))
;; Leiningen project definition (project.clj) for the parser namespace above.
;; NOTE(review): the project name "project-name" and the description "."
;; look like placeholders -- confirm before publishing.
(defproject project-name "0.1"
:description "."
;; clojure-contrib supplies clojure.contrib.logging, which the namespace
;; requires as `log`; htmlcleaner supplies org.htmlcleaner.HtmlCleaner;
;; commons-lang supplies org.apache.commons.lang.StringEscapeUtils.
:dependencies [[org.clojure/clojure "1.1.0"]
[org.clojure/clojure-contrib "1.1.0"]
[org.clojars.sids/htmlcleaner "2.1"]
[commons-lang "2.5"]])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment