Skip to content

Instantly share code, notes, and snippets.

@turugina
Created January 14, 2011 06:59
Show Gist options
  • Save turugina/779296 to your computer and use it in GitHub Desktop.
Save turugina/779296 to your computer and use it in GitHub Desktop.
html scraping test /clojure
(ns htmlscrapetest
(:require [clojure.xml :as xml]
[clojure.zip :as zip]
[clojure.contrib.zip-filter :as z]
[clojure.contrib.zip-filter.xml :as zf])
(:use [clojure.java.io :only [as-file]])
; http://htmlcleaner.sourceforge.net/
(:import [org.htmlcleaner CleanerProperties CompactXmlSerializer HtmlCleaner])
)
(defn mytest
  "Extracts thumbnail records from a parsed HTML zipper.

  Visits every <img> reached through a ul > li > a > img chain anywhere
  at or below `zipper` and returns a lazy sequence of maps with the keys:
    :thumb_url  - java.net.URL built from the image's src attribute
    :illust_id  - the digits captured from \"/<digits>_s.(jpg|gif|png)\"
                  in src, or nil when src does not match that pattern
    :title      - the image's alt attribute, or nil when it has none

  Images without a src attribute are skipped. Both attributes are read
  from the same <img> location, so a missing alt on one image can never
  shift titles onto the wrong thumbnail.

  NOTE(review): java.net.URL. is expected to throw on a src it cannot
  parse as an absolute URL - confirm the scraped pages only carry
  absolute thumbnail URLs."
  [zipper]
  (for [img (zf/xml-> zipper z/descendants :ul :li :a :img)
        ;; Two-argument zf/attr reads the attribute straight off this loc.
        :let [src (zf/attr img :src)]
        ;; Mirrors the filtering xml-> applied to (zf/attr :src): no src, no record.
        :when src]
    {:thumb_url (java.net.URL. src)
     ;; re-find yields [whole-match id] or nil; second of nil is nil.
     :illust_id (second (re-find #"/(\d+)_s\.(?:jpg|gif|png)" src))
     :title     (zf/attr img :alt)}))
(defn parse-html
  "Cleans the HTML file `f` with HtmlCleaner and returns an xml-zip zipper
  over the result, or nil when the cleaner yields no node.

  `f` is coerced with clojure.java.io/as-file and read as \"utf-8\".
  Comments are omitted and script/style elements are pruned before the
  cleaned tree is serialized to a compact XML string, which is then
  re-parsed by clojure.xml/parse."
  [f]
  (let [cleaner (HtmlCleaner.)
        props   (.getProperties cleaner)]
    ;; Configure the cleaner's own properties object before cleaning.
    (.setOmitComments props true)
    (.setPruneTags props "script, style")
    (when-let [node (.clean cleaner (as-file f) "utf-8")]
      ;; cleaned tree -> XML string -> reader -> SAX input -> element tree -> zipper
      (-> (CompactXmlSerializer. props)
          (.getAsString node "utf-8")
          (java.io.StringReader.)
          (org.xml.sax.InputSource.)
          xml/parse
          zip/xml-zip))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment