Skip to content

Instantly share code, notes, and snippets.

@paulkoerbitz
Created October 18, 2011 11:35
Show Gist options
  • Save paulkoerbitz/1295222 to your computer and use it in GitHub Desktop.
Save paulkoerbitz/1295222 to your computer and use it in GitHub Desktop.
Simple webscrape with enlive
(ns tutorial.scrape1
(:require [net.cgrand.enlive-html :as html]))
;; Sample URL on belex.rs (the path contains "prospekt/VZAS"). It is not
;; referenced by any other form visible in this file.
;; The earmuffed name advertises a rebindable var, so it is explicitly marked
;; dynamic: without the flag Clojure 1.3+ prints a "not declared dynamic"
;; warning at load time and `binding` on the var throws.
(def ^{:dynamic true} *url* "http://www.belex.rs/trgovanje/prospekt/VZAS/show")
(defn get-page
  "Builds a java.net.URL from the string `url` and returns whatever enlive's
  html-resource produces for that location."
  [url]
  (let [location (java.net.URL. url)]
    (html/html-resource location)))
(defn content->string
  "Flattens `content` into a single string.

  - nil             -> \"\"
  - a string        -> returned unchanged
  - a map           -> the flattened value of its :content entry
  - any other coll  -> the concatenation of its flattened elements
  - anything else   -> (str content)"
  [content]
  (cond
    (nil? content)    ""
    (string? content) content
    ;; maps are tested before the generic collection branch because a map is
    ;; itself a collection; only its :content is of interest.
    (map? content)    (recur (:content content))
    (coll? content)   (->> content
                           (map content->string)
                           (apply str))
    :else             (str content)))
;; Global hierarchy used by the parse-node dispatch function: concrete classes
;; are mapped onto the abstract tags ::Map, ::String and ::Collection.
;; `isa?` also consults the supers of a class, so deriving an interface covers
;; every class implementing it. The interface entries below make hash maps,
;; records and vectors dispatch like the struct/array maps and seqs that were
;; already registered; the original concrete-class entries are kept as-is.
(derive clojure.lang.IPersistentMap ::Map)      ; any persistent map type
(derive clojure.lang.PersistentStructMap ::Map)
(derive clojure.lang.PersistentArrayMap ::Map)
(derive java.lang.String ::String)
(derive clojure.lang.Sequential ::Collection)   ; vectors as well as lists/seqs
(derive clojure.lang.ISeq ::Collection)
(derive clojure.lang.PersistentList ::Collection)
(derive clojure.lang.LazySeq ::Collection)
(defn tag-type
  "Classifies `node` by its :tag value:

  - :tr, :table    -> ::CompoundNode
  - :th, :td, :h3  -> ::TerminalNode
  - anything else (including :tbody and a missing :tag) -> ::IgnoreNode"
  [node]
  (condp contains? (:tag node)
    #{:tr :table}  ::CompoundNode
    #{:th :td :h3} ::TerminalNode
    ::IgnoreNode))
;; parse-node turns a node tree (maps carrying :tag / :content, strings, and
;; collections of those) into nested sequences of strings.
;;
;; The dispatch value is the pair [class-of-node kind]. `kind` is the result
;; of tag-type when the node's class is registered under ::Map, otherwise nil.
;; Methods are matched through the global hierarchy, so [::Map ...],
;; [::String nil] and [::Collection nil] apply to every class derived from
;; those tags.
(defmulti parse-node
  (fn [node]
    (let [cls (class node)]
      [cls (when (isa? cls ::Map) (tag-type node))])))

;; Terminal nodes (:th, :td, :h3) collapse to the text of their content.
(defmethod parse-node [::Map ::TerminalNode] [node]
  (content->string (:content node)))

;; Compound nodes (:tr, :table) yield one parsed entry per child.
(defmethod parse-node [::Map ::CompoundNode] [node]
  (map parse-node (:content node)))

;; Ignored nodes are transparent: the result is that of parsing their
;; :content value directly (which may be a collection or nil).
(defmethod parse-node [::Map ::IgnoreNode] [node]
  (parse-node (:content node)))

;; Strings are already in their final form.
(defmethod parse-node [::String nil] [node]
  node)

;; A collection of nodes is parsed element by element.
(defmethod parse-node [::Collection nil] [node]
  (map parse-node node))

;; nil reaches the dispatcher when an ignored node has no :content (for
;; example a map without :tag and :content keys). (class nil) is nil, so the
;; dispatch value is [nil nil]; without this method such a node aborted the
;; whole parse with "No method in multimethod". There is nothing to parse,
;; so the result is nil.
(defmethod parse-node [nil nil] [_]
  nil)
(defn h3+table
  "Fetches the page at `url` with get-page, selects every <h3> and <table>
  located under div#prospekt_container, and returns a lazy sequence holding
  the parse-node result for each selected node."
  [url]
  (let [selector #{[:div#prospekt_container :h3]
                   [:div#prospekt_container :table]}
        page     (get-page url)]
    (map parse-node (html/select page selector))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment