Created
November 30, 2008 18:29
-
-
Save weepy/30497 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'rubygems'
require 'hpricot'
require 'open-uri'
# Minimal single-site crawler built on Hpricot and open-uri.
#
# Starting from one URL it fetches each page, collects the +href+ of every
# <a> element and queues the links that stay on the site. Each URL is
# fetched at most once per Spiderz instance.
#
# Subclasses may override the +started+, +completed+, +failed+ and
# +succeded+ hooks to react to crawl events; the defaults just print.
class Spiderz
  # Matches a URL that begins with an explicit scheme ("http://", "ftp://", ...).
  ABSOLUTE_URL = %r{\A[a-z][a-z0-9+.\-]*://}i

  # @param root [String] site prefix including the scheme and without a
  #   path, e.g. "http://www.google.com"
  def initialize(root)
    @followed = {}
    @to_follow = [] # so spider_loop is safe even before crawl is called
    @root = root
  end

  # Crawls the site starting at +url+ (normally a path such as "/").
  #
  # @param url [String] first URL to visit
  # @return whatever the +completed+ hook returns
  def crawl(url)
    started(url)
    @to_follow = [url]
    spider_loop
    completed
  end

  # Drains the work queue in FIFO order, appending the links found on each
  # visited page.
  def spider_loop
    @to_follow.concat(page_links(@to_follow.shift)) until @to_follow.empty?
  end

  # @param href [String]
  # @return [Boolean] true when +href+ is an absolute URL on another site
  def external?(href)
    # Anchored scheme test: a URL that merely appears inside a query string
    # ("/go?to=http://...") does not make a relative link external.
    !!(href =~ ABSOLUTE_URL) && !same_site?(href)
  end

  # @param href [String]
  # @return [Boolean] true for in-page anchors such as "#top"
  def bookmark?(href)
    !!(href =~ /\A#/)
  end

  # @param href [String]
  # @return [Boolean] true for "mailto:" links (not for paths that merely
  #   contain the word "mailto")
  def mail?(href)
    !!(href =~ /\Amailto:/i)
  end

  # @param href [String, nil]
  # @return [Boolean] true when +href+ must not be queued: missing, empty,
  #   already visited, off-site, a mail link or an in-page anchor
  def skip?(href)
    return true if href.nil? || href.empty?

    @followed[href] || external?(href) || mail?(href) || bookmark?(href) ? true : false
  end

  # Hook: called once before the first page is fetched.
  def started(url)
    puts "Started crawling from #{url}"
  end

  # Hook: called once after the queue has been drained.
  def completed
    puts "Completed crawling"
  end

  # Hook: called when +url+ could not be fetched or parsed.
  def failed(url)
    puts "Failed to open: #{url}"
  end

  # Hook: called with the parsed document after +url+ was fetched.
  # (The misspelled name is kept because subclasses may override it.)
  def succeded(url, doc)
    puts "Successfully opened: #{url}"
  end

  # Fetches +url+ and returns the links on it that are worth following.
  #
  # @param url [String] path relative to the root, or an absolute same-site URL
  # @return [Array<String>] hrefs to queue; empty when the page was already
  #   visited or could not be opened
  def page_links(url)
    return [] if @followed[url]

    @followed[url] = true

    begin
      doc = fetch_document(url)
    rescue StandardError
      failed(url)
      return []
    end

    succeded(url, doc)
    hrefs = (doc / "a").map { |anchor| anchor.attributes["href"] }
    hrefs.reject { |href| skip?(href) }
  end

  private

  # True when +href+ starts with the root AND the root is followed by a URL
  # delimiter, so "http://site.com.evil.org" is not mistaken for "http://site.com".
  def same_site?(href)
    return false unless href.start_with?(@root)

    rest = href[@root.length..-1]
    rest.empty? || @root.end_with?("/") || rest.start_with?("/", "?", "#")
  end

  # Downloads and parses one page. Absolute URLs are fetched as-is; anything
  # else is appended to the root. The block form closes the handle after parsing.
  def fetch_document(url)
    target = url =~ ABSOLUTE_URL ? url : @root + url
    if URI.respond_to?(:open)
      # Kernel#open no longer accepts URLs on Ruby 3.0+.
      URI.open(target) { |io| Hpricot(io) }
    else
      open(target) { |io| Hpricot(io) }
    end
  end
end
# Example run: crawl the whole site starting from its home page.
spider = Spiderz.new("http://www.orbitsound.co.uk")
spider.crawl("/")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment