Skip to content

Instantly share code, notes, and snippets.

@ponkore
Created December 14, 2013 13:14
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ponkore/7958993 to your computer and use it in GitHub Desktop.
Save ponkore/7958993 to your computer and use it in GitHub Desktop.
Clojure で HTML スクレイピングしてみる ref: http://qiita.com/ponkore/items/5309023186353de49172
[net.sourceforge.htmlcleaner/htmlcleaner "2.2"]
(defn html->node
  "Parses the HTML string `html-src` with `cleaner` and returns whatever
  `.clean` yields (an org.htmlcleaner.TagNode according to the usage notes
  accompanying this snippet).

  Before parsing, the properties object obtained from `cleaner` is
  configured in place to:
    - omit HTML comments,
    - prune <script> and <style> tags,
    - omit the XML declaration."
  [cleaner html-src]
  (let [props (.getProperties cleaner)]
    ;; Ignore HTML comments.
    (.setOmitComments props true)
    ;; Drop <script> and <style> elements entirely.
    (.setPruneTags props "script,style")
    (.setOmitXmlDeclaration props true)
    ;; cleaner.clean(String) performs the actual parse.
    (.clean cleaner html-src)))
;; Example call: parse the HTML string bound to page-src with a fresh HtmlCleaner.
(html->node (HtmlCleaner.) page-src)
;; => a node object (org.htmlcleaner.TagNode)
(defn node->xml
  "Serializes `node` to an XML string with a CompactXmlSerializer built from
  the properties of `cleaner`, then parses that string with
  clojure.xml/parse and returns the result."
  [cleaner node]
  (let [serializer (CompactXmlSerializer. (.getProperties cleaner))
        ;; Render the node as an XML string.
        xml-text   (.getAsString serializer node)
        ;; Wrap the string in an InputSource so it is parsed as content.
        source     (org.xml.sax.InputSource. (java.io.StringReader. xml-text))]
    ;; clojure.xml/parse turns it into Clojure's internal XML representation.
    (xml/parse source)))
(defn test01
  "Reads the page at `url` with slurp, parses it with html->node, converts
  the resulting node with node->xml, and returns that value. The same
  HtmlCleaner instance is used for both steps."
  [url]
  (let [cleaner (HtmlCleaner.)]
    ;; Any further processing of the parsed XML would go after this pipeline.
    (->> (slurp url)
         (html->node cleaner)
         (node->xml cleaner))))
;; Run test01 against a live page (slurp fetches the URL) and bind the result to x.
(def x (test01 "http://qiita.com/advent-calendar/2013/lisp"))
;; Evaluate x to inspect the parsed structure.
x
;; => {:tag :html, :attrs nil, :content [{:tag :head, :attrs nil, :content [{:tag :meta, :attrs {:charset "UTF-8"}, :content nil} ...
(ns html-parser.core
(:require [clojure.xml :as xml])
(:import [org.htmlcleaner HtmlCleaner CompactXmlSerializer]))
;; Reference: https://gist.github.com/sids/391818
(defn html->node
  "Parses the HTML string `html-src` with `cleaner` and returns the result
  of `.clean`.

  Before parsing, the properties object obtained from `cleaner` is
  configured in place to:
    - omit HTML comments,
    - prune <script> and <style> tags,
    - omit the XML declaration."
  [cleaner html-src]
  (let [props (.getProperties cleaner)]
    ;; Ignore HTML comments.
    (.setOmitComments props true)
    ;; Drop <script> and <style> elements entirely.
    (.setPruneTags props "script,style")
    (.setOmitXmlDeclaration props true)
    ;; cleaner.clean(String) performs the actual parse.
    (.clean cleaner html-src)))
(defn node->xml
  "Serializes `node` to an XML string with a CompactXmlSerializer built from
  the properties of `cleaner`, then parses that string with
  clojure.xml/parse and returns the result."
  [cleaner node]
  (let [serializer (CompactXmlSerializer. (.getProperties cleaner))
        ;; Render the node as an XML string.
        xml-text   (.getAsString serializer node)
        ;; Wrap the string in an InputSource so it is parsed as content.
        source     (org.xml.sax.InputSource. (java.io.StringReader. xml-text))]
    ;; clojure.xml/parse turns it into Clojure's internal XML representation.
    (xml/parse source)))
(defn test01
  "Reads the page at `url` with slurp, parses the HTML with html->node,
  converts the resulting node with node->xml, and returns that value.

  A single HtmlCleaner instance is created and shared by both steps so the
  serializer sees the same properties that were used for parsing."
  [url]
  (let [cleaner  (HtmlCleaner.)
        page-src (slurp url)
        ;; html->node is the parser defined in this namespace; the symbol
        ;; str->node does not exist here and would fail to resolve.
        node     (html->node cleaner page-src)
        xml      (node->xml cleaner node)]
    ;; Further processing of xml would go here...
    xml))
;; (def x (test01 "http://qiita.com/advent-calendar/2013/lisp"))
;; => ...
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment