Skip to content

Instantly share code, notes, and snippets.

@cmiles74
Created September 15, 2009 19:29
Show Gist options
  • Save cmiles74/187551 to your computer and use it in GitHub Desktop.
Save cmiles74/187551 to your computer and use it in GitHub Desktop.
;;
;; Provides functions for managing links.
;;
(ns org.cooleydickinson.crawler.links
(:use [org.cooleydickinson.crawler.tagsoup :only (parse)])
(:import (java.net URL)))
(defn canonicize
"Returns a canonical (not relative) URL for the supplied URL. The
information used to canonicize the URL is copied from the supplied
URL object."
[url url-text]
;; make sure the url isn't already cannonical
(if (or
(= "http://" (apply str (take 7 url-text)))
(= "https://" (apply str (take 8 url-text))))
url-text
(str
(. url getProtocol)
"://"
(. url getHost)
;; only include the port if it's not 80 or 443
(if (and (< 0 (. url getPort))
(or (not= 80 (. url getPort))
(not= 443 (. url getPort))))
(str ":" (. url getPort)))
url-text)))
(defn links
"Returns a list of the unique canonical links that are referenced by
the content at the supplied URL."
[url-text]
;; create a URL for the passed in url-text
(let [url (new URL url-text)]
;; canonicize all of our URLs and return a unique list
(set
(map (partial canonicize url)
;; return a list of URLs referenced by the URL
;; (excluding javascript links)
(for [item (xml-seq (parse (. url openStream)))
:when (and
(= :a (:tag item))
(not= "javascript:"
(apply str (take 11 (:href (:attrs item))))))]
(:href (:attrs item)))))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment