-
-
Save devn/bdc2f0729ea6d478ec08 to your computer and use it in GitHub Desktop.
yokogiri-gist
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns yokogiri.core
  (:import [com.gargoylesoftware.htmlunit WebClient BrowserVersion]
           [com.gargoylesoftware.htmlunit.html HtmlPage]
           [org.w3c.dom Attr NamedNodeMap Node]))
;; Development switches. The form is wrapped in `comment`, so it is never
;; evaluated when the file loads; evaluate the inner form by hand to enable it.
(comment
  (set! *warn-on-reflection* true))
(defn make-client
  "Returns a new WebClient built with its no-argument constructor."
  []
  (WebClient.))
(defn visit
  "Calls `getPage` on the WebClient `c` with the string `url` and returns
  the result."
  [^WebClient c ^String url]
  (.getPage c url))
(defn xpath
  "Calls `getByXPath` on the HtmlPage `page` with the XPath expression string
  `xpath` and returns the result."
  [^HtmlPage page ^String xpath]
  (.getByXPath page xpath))
(defn node-xml
  "Returns the result of calling `asXml` on `node`.
  `node` is not type-hinted, so the call is resolved reflectively."
  [node]
  (. node asXml))
(defn node-text
  "Returns the result of calling `asText` on `node`.
  `node` is not type-hinted, so the call is resolved reflectively."
  [node]
  (. node asText))
(defn attrs
  "Returns a map of the attributes of the DOM node `node`, keyed by the
  attribute name as a keyword, with the attribute value as the map value.

  Returns an empty map when `getAttributes` yields nil or an empty
  NamedNodeMap."
  [node]
  (let [^NamedNodeMap attr-map (.getAttributes ^Node node)
        ;; Use the NamedNodeMap API for the size: `count` only works when the
        ;; concrete implementation also happens to be a Map/Collection.
        ;; Computed once rather than on every iteration.
        n (if attr-map (.getLength attr-map) 0)]
    (reduce (fn [res i]
              (let [^Attr attr (.item attr-map i)]
                (assoc res (keyword (.getName attr)) (.getValue attr))))
            {}
            (range n))))
(comment ;; TODO: Add arity
  ;; Disabled sketch: a make-client that can be given a browser nickname.
  ;; Everything here sits inside `comment`, so none of it is evaluated.

  ;; Lookup table from upper-case nickname to BrowserVersion constant.
  (def browser-versions
    {"IE6" BrowserVersion/INTERNET_EXPLORER_6
     "IE7" BrowserVersion/INTERNET_EXPLORER_7
     "IE8" BrowserVersion/INTERNET_EXPLORER_8
     "FF3" BrowserVersion/FIREFOX_3})

  (defn make-client
    "With no arguments, uses the nickname of the default BrowserVersion.
    With `version` (a nickname string, case-insensitive), looks it up in
    `browser-versions`, falling back to the default BrowserVersion when the
    nickname is not in the table."
    ([] (make-client (.getNickname (BrowserVersion/getDefault))))
    ([^String version]
     ;; The map is the lookup target; the original called the string as a fn.
     (let [^BrowserVersion vers (get browser-versions
                                     (.toUpperCase version)
                                     (BrowserVersion/getDefault))]
       ;; Actually hand the chosen version to the client.
       (WebClient. vers))))
  )
(comment ;; Trying something else instead...
  ;; Disabled experiment using the NekoHTML DOMParser in place of HtmlUnit.
  ;; Everything here sits inside `comment`, so none of it is evaluated.
  (ns yokogiri.core
    (:import [org.cyberneko.html.parsers DOMParser]
             ;; Same package as the type hints below use.
             [org.apache.html.dom HTMLDocumentImpl]))

  (def parser
    (new DOMParser))

  ;; Holds whatever `parse` returns for the hard-coded URL.
  ;; NOTE(review): `parse` is presumably void, making this nil; the parsed
  ;; document is read from `getDocument` below - confirm.
  (def parsed-url
    (. parser parse "http://clojure-log.n01se.net/date/2008-02-01.html"))

  (def document
    (. parser getDocument))

  (defn get-element-by-id
    "Calls `getElementById` on `doc` with `id` and returns the result."
    [^org.apache.html.dom.HTMLDocumentImpl doc ^String id]
    (. doc getElementById id))

  (defn get-elements-by-tag-name
    "Calls `getElementsByTagName` on `doc` with `tag` and returns the result."
    [^org.apache.html.dom.HTMLDocumentImpl doc ^String tag]
    (. doc getElementsByTagName tag))
  )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment