Last active
August 29, 2015 13:56
-
-
Save rxacevedo/c79c5813f799a6de7cf9 to your computer and use it in GitHub Desktop.
Scraping via XPath
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; Diagnostic: pretty-print every classpath entry that mentions xalan.
;; Incanter puts xalan 2.6 on the classpath, which causes issues with
;; clj-xpath, so do NOT use Incanter together with clj-xpath.
;;
;; The entries are read from the java.class.path system property instead of
;; calling .getURLs on the system class loader: since Java 9 that loader is no
;; longer a URLClassLoader, so .getURLs cannot be called on it.
(->> (.split (System/getProperty "java.class.path") java.io.File/pathSeparator)
     (filter #(re-find #"xalan" %))
     (pprint))
;; Import stuff | |
(do | |
(require '(clj-http (client :as c))) | |
(require '(clj-xpath (core :refer :all)))) | |
;; Zero-argument function returning the :body of the HTTP response for the
;; Atom feed. It is wrapped in memoize, so once a call has succeeded the cached
;; body is returned and the feed is not fetched again.
(def my-rss-xml
  (memoize
   (fn []
     (let [response (c/get "http://robertoacevedo.net/atom.xml")]
       (:body response)))))
;; Extract the title, id (used as the URL) and updated timestamp of every
;; feed entry and pretty-print them as a sequence of maps.
;; The feed is parsed into a document once and that document is shared by the
;; three XPath queries, instead of re-parsing the XML string for each query.
;; NOTE(review): the three result lists are combined by position, which assumes
;; every <entry> has exactly one title, id and updated element -- confirm.
(let [doc     (xml->doc (my-rss-xml))
      titles  ($x:text* "//entry/title" doc)
      urls    ($x:text* "//entry/id" doc)
      updated ($x:text* "//entry/updated" doc)]
  (->> (map (fn [t u utime] {:title t :url u :updated utime})
            titles urls updated)
       (pprint)))
; Outputs.. | |
; | |
; ({:title "Sierpinski Triangle Fractal", | |
; :url | |
; "http://rxacevedo.github.io/blog/2014/02/15/sierpinski-triangle-fractal", | |
; :updated "2014-02-15T22:36:06-05:00"} | |
; {:title "Approximating the Golden Ratio", | |
; :url | |
; "http://rxacevedo.github.io/blog/2014/02/09/approximating-the-golden-ratio", | |
; :updated "2014-02-09T12:47:20-05:00"} | |
; {:title "Dynamic Binding and Being Meta", | |
; :url | |
; "http://rxacevedo.github.io/blog/2014/02/07/dynamic-binding-and-being-meta", | |
; :updated "2014-02-07T17:47:38-05:00"} | |
; {:title "Predicting Algorithm Running Times", | |
; :url | |
; "http://rxacevedo.github.io/blog/2014/01/26/predicting-algorithm-running-times", | |
; :updated "2014-01-26T19:47:29-05:00"} | |
; {:title "First-class Functions", | |
; :url | |
; "http://rxacevedo.github.io/blog/2014/01/07/first-class-functions", | |
; :updated "2014-01-07T19:25:44-05:00"} | |
; {:title "Scala and Clojure List Operations", | |
; :url | |
; "http://rxacevedo.github.io/blog/2013/12/18/scala-and-clojure-list-operations", | |
; :updated "2013-12-18T19:32:00-05:00"} | |
; {:title "A Tale of Two Languages", | |
; :url | |
; "http://rxacevedo.github.io/blog/2013/10/20/a-tale-of-two-languages", | |
; :updated "2013-10-20T10:51:00-04:00"} | |
; {:title "Recursion in Scala", | |
; :url "http://rxacevedo.github.io/blog/2013/04/08/recursion-in-scala", | |
; :updated "2013-04-08T19:33:00-04:00"} | |
; {:title "Multi-threaded Socket Server", | |
; :url "http://rxacevedo.github.io/blog/2012/12/03/socket-server", | |
; :updated "2012-12-03T11:34:00-05:00"}) | |
;; Zero-argument function returning the :body of the HTTP response for the
;; Hacker News RSS feed. It is wrapped in memoize, so once a call has succeeded
;; the cached body is returned and the feed is not fetched again.
(def hackernews-xml
  (memoize
   (fn []
     (let [response (c/get "http://news.ycombinator.com/rss")]
       (:body response)))))
;; Extract the title, link and comments URL of every RSS item and
;; pretty-print them as a sequence of maps.
;; The feed is parsed into a document once and that document is shared by the
;; three XPath queries, instead of re-parsing the XML string for each query.
;; The comments value is bound to `discussion` rather than `c` so it cannot be
;; confused with the `c` alias used for clj-http.client.
;; NOTE(review): the three result lists are combined by position, which assumes
;; every <item> has exactly one title, link and comments element -- confirm.
(let [doc      (xml->doc (hackernews-xml))
      titles   ($x:text* "rss//item/title" doc)
      links    ($x:text* "rss//item/link" doc)
      comments ($x:text* "rss//item/comments" doc)]
  (->> (map (fn [title link discussion]
              {:title title :url link :comments discussion})
            titles links comments)
       (pprint)))
; Outputs.. | |
; | |
; ({:title "The future of Fiber", | |
; :url "https://fiber.google.com/newcities/", | |
; :comments "https://news.ycombinator.com/item?id=7265143"} | |
; {:title "WebGL Water", | |
; :url "http://madebyevan.com/webgl-water/", | |
; :comments "https://news.ycombinator.com/item?id=7264103"} | |
; {:title "How Microryza Acquired the Domain Experiment.com", | |
; :url | |
; "http://priceonomics.com/how-microryza-acquired-the-domain-experimentcom/", | |
; :comments "https://news.ycombinator.com/item?id=7265540"} | |
; {:title "How I was able to track the location of any Tinder user", | |
; :url | |
; "http://blog.includesecurity.com/2014/02/how-i-was-able-to-track-location-of-any.html", | |
; :comments "https://news.ycombinator.com/item?id=7265220"} | |
; {:title | |
; "Canonical announces first partners to ship Ubuntu phones around the globe", | |
; :url | |
; "http://insights.ubuntu.com/news/canonical-announces-first-partners-to-ship-ubuntu-phones-around-the-globe/", | |
; :comments "https://news.ycombinator.com/item?id=7264573"} | |
; {:title "This App Trains You to See Farther", | |
; :url | |
; "http://www.popularmechanics.com/_mobile/science/health/med-tech/this-app-trains-you-to-see-farther-16506910", | |
; :comments "https://news.ycombinator.com/item?id=7266233"} | |
; {:title | |
; "Heap's new interface for analytics: clicking around your site", | |
; :url "http://blog.heapanalytics.com/the-event-visualizer/", | |
; :comments "https://news.ycombinator.com/item?id=7265039"}) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment