Created
September 15, 2009 19:29
-
-
Save cmiles74/187551 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; | |
;; Provides functions for managing links. | |
;; | |
(ns org.cooleydickinson.crawler.links | |
(:use [org.cooleydickinson.crawler.tagsoup :only (parse)]) | |
(:import (java.net URL))) | |
(defn canonicize | |
"Returns a canonical (not relative) URL for the supplied URL. The | |
information used to canonicize the URL is copied from the supplied | |
URL object." | |
[url url-text] | |
;; make sure the url isn't already cannonical | |
(if (or | |
(= "http://" (apply str (take 7 url-text))) | |
(= "https://" (apply str (take 8 url-text)))) | |
url-text | |
(str | |
(. url getProtocol) | |
"://" | |
(. url getHost) | |
;; only include the port if it's not 80 or 443 | |
(if (and (< 0 (. url getPort)) | |
(or (not= 80 (. url getPort)) | |
(not= 443 (. url getPort)))) | |
(str ":" (. url getPort))) | |
url-text))) | |
(defn links | |
"Returns a list of the unique canonical links that are referenced by | |
the content at the supplied URL." | |
[url-text] | |
;; create a URL for the passed in url-text | |
(let [url (new URL url-text)] | |
;; canonicize all of our URLs and return a unique list | |
(set | |
(map (partial canonicize url) | |
;; return a list of URLs referenced by the URL | |
;; (excluding javascript links) | |
(for [item (xml-seq (parse (. url openStream))) | |
:when (and | |
(= :a (:tag item)) | |
(not= "javascript:" | |
(apply str (take 11 (:href (:attrs item))))))] | |
(:href (:attrs item))))))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment