Skip to content

Instantly share code, notes, and snippets.

@pithyless
Last active September 7, 2020 14:30
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pithyless/5ce137958be0c6a8411b403885c99665 to your computer and use it in GitHub Desktop.
Save pithyless/5ce137958be0c6a8411b403885c99665 to your computer and use it in GitHub Desktop.
jsoup Clojure snippet
(ns jsoup-scraping.core
(:gen-class)
(:require [clojure.string :as str]
[clojure.data.csv :as csv]
[clojure.java.io :as io])
(:import org.jsoup.Jsoup))
;; Newegg product-list URL used as the default scrape target. The query string
;; asks for Description=GTX, Page=1 and PageSize=36, ordered by BESTMATCH.
(def newegg-url "http://www.newegg.com/Product/ProductList.aspx?Submit=ENE&N=-1&IsNodeId=1&Description=GTX&bop=And&Page=1&PageSize=36&order=BESTMATCH")
(defn fetch-page
  "Opens a Jsoup connection to `url` and returns the result of calling
  `.get` on it."
  [url]
  (let [connection (Jsoup/connect url)]
    (.get connection)))
(defn dom-elements
  "Runs `selector` against `root` by calling its `select` method and returns
  whatever that call yields."
  [root selector]
  (-> root
      (.select selector)))
(defn dom-text
  "Selects `selector` under `root` via `dom-elements` and returns the result
  of calling `.text` on that selection."
  [root selector]
  (let [matched (dom-elements root selector)]
    (.text matched)))
(defn dom-attr
  "Selects `selector` under `root` via `dom-elements` and returns the result
  of calling `.attr` with `attribute` on that selection."
  [root selector attribute]
  (let [matched (dom-elements root selector)]
    (.attr matched attribute)))
(defn parse-product
  "Builds one CSV row from a single product element.

  `item` is a node that `dom-text` / `dom-attr` can query; `generate-csv`
  passes each member of the `div.item-container` selection.

  Returns the vector `[brand product-name shipping]` where
    brand        - `title` attribute of the branding image,
    product-name - text of `.item-title` with every comma replaced by a pipe,
    shipping     - text of the shipping price entry with the dollar sign and
                   the word `Shipping` removed, trimmed of surrounding
                   whitespace."
  [item]
  (let [product-name (-> (dom-text item ".item-title")
                         (str/replace "," "|"))
        brand        (dom-attr item "div.item-branding > a > img" "title")
        ;; Stripping the word "Shipping" leaves its separating space behind
        ;; (e.g. "$4.99 Shipping" -> "4.99 "), so trim before emitting the
        ;; value into the CSV.
        shipping     (-> (dom-text item "div.item-action ul.price > li.price-ship")
                         (str/replace "$" "")
                         (str/replace "Shipping" "")
                         (str/trim))]
    [brand product-name shipping]))
(def csv-headers
  "Header row for the generated CSV; column order matches the vector
  returned by `parse-product`."
  (vector "brand" "product_name" "shipping"))
(defn generate-csv
  "Fetches `url`, parses every `div.item-container` element with
  `parse-product`, and writes `csv-headers` followed by the parsed rows to
  `filename` as CSV.

  The page is fetched before the output file is opened; rows are mapped
  lazily and consumed while the writer is open."
  [filename url]
  (let [page  (fetch-page url)
        items (dom-elements page "div.item-container")
        rows  (cons csv-headers (map parse-product items))]
    (with-open [out (io/writer filename)]
      (csv/write-csv out rows))))
(defn -main
  "Command-line entry point.

  Accepts optional positional arguments: the output filename, then the URL to
  scrape. Missing arguments fall back to `test-graphics.csv` and `newegg-url`,
  so running with no arguments behaves as before. Taking varargs also avoids
  an ArityException when the program is launched with command-line arguments."
  [& args]
  (let [[filename url] args]
    (generate-csv (or filename "test-graphics.csv")
                  (or url newegg-url))))
;; REPL scratchpad: the forms below are wrapped in `comment`, so they are not
;; evaluated when the namespace loads. Evaluate them one at a time, top to
;; bottom, since later forms use the vars defined by earlier ones.
(comment
;; Fetch the listing page and keep it in a var for interactive exploration.
(def html (fetch-page newegg-url))
;; Select the product elements from the fetched page.
(def items (dom-elements html "div.item-container"))
;; Try the individual extractors against the first product.
(dom-text (first items) ".item-title")
(dom-attr (first items) ".item-branding > a > img" "title")
;; Build the full CSV row for the first product.
(parse-product (first items))
;; Run the whole pipeline, writing to graphics_cards.csv.
(generate-csv "graphics_cards.csv" newegg-url)
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment