Skip to content

Instantly share code, notes, and snippets.

@tobinharris
Created January 7, 2010 22:22
Show Gist options
  • Save tobinharris/271647 to your computer and use it in GitHub Desktop.
Save tobinharris/271647 to your computer and use it in GitHub Desktop.
require 'rubygems'
require 'hpricot'
require 'open-uri'
require 'pp'
# Breadth-first crawler for a single site. Starting from a root URL it fetches
# pages, parses them with Hpricot, and records every indexed page as a Node in
# a tree whose top is available from #root.
#
#   spider = Spider.new
#   spider.start('http://example.com')
#   spider.root  # => Node for the first indexed page, or nil if none was indexed
class Spider
  # max_pages:: stop once this many pages have been fetched and indexed.
  # delay::     seconds to pause after each indexed page (0 or nil disables it).
  def initialize(max_pages = 50, delay = 1)
    @max_pages = max_pages
    @delay = delay
    @counter = 0
    @crawled_urls = []
    @root_url = nil
    @root_node = nil
    @queue = []
  end

  # Crawls the site rooted at +url+ until the queue is empty or the page limit
  # is reached. Queue entries are [url, parent_node] pairs.
  def start(url)
    @root_url = url
    @queue << [url, nil]
    while !@queue.empty? && @counter < @max_pages
      page_url, parent = @queue.shift
      index(page_url, parent)
    end
  end

  # Fetches and indexes one page, queueing the links found on it.
  # Does nothing for a URL already seen, a URL outside the root site, or a
  # page that cannot be fetched.
  def index(url, parent)
    puts "Considering #{url}"
    # don't do the same page twice
    return if @crawled_urls.include?(url)
    # add url so we don't bother again
    @crawled_urls << url
    # Stay on the root site. A prefix test is used rather than a substring
    # test so that off-site URLs that merely embed the root URL (for example
    # in a query string) are not crawled.
    return unless url.start_with?(@root_url)

    doc = fetch(url)
    return if doc.nil?

    node = create_node(doc, url, parent)
    # Relative links are resolved against the page they were found on.
    links_for(doc, url).each { |link| @queue << [link, node] }
    puts "Indexed #{url}"
    @counter += 1
    # don't hammer server
    sleep @delay if @delay && @delay > 0
  end

  # Builds the Node for a fetched page, attaches it to +parent+ (when given)
  # and remembers the first node ever created as the tree root.
  # Returns the new node.
  def create_node(doc, url, parent)
    node = Node.new
    parent.children << node unless parent.nil?
    node.url = url
    found = doc.search('/html/head/title')
    node.title = found[0].inner_html if found.length == 1
    # page_name is the URL with the leading root URL removed; the root page
    # itself keeps its full URL.
    node.page_name = url == @root_url ? url : url.sub(@root_url, '')
    @root_node = node if @root_node.nil?
    node
  end

  # Returns the crawlable, not-yet-visited link targets of +doc+ as absolute
  # URLs with any trailing slash removed.
  # base:: URL that relative hrefs are resolved against (defaults to the root URL).
  def links_for(doc, base = @root_url)
    links = []
    (doc / 'a').each do |link|
      href = link.attributes['href']
      next unless href
      url = absolute_url(href, base).chomp('/')
      next if @crawled_urls.include?(url)
      next unless is_crawlable_url(url)
      links << url
    end
    links
  end

  # False for mailto: links and for URLs that contain a fragment ('#').
  def is_crawlable_url(url)
    return false if url =~ /\A\s*mailto:/i
    return false if url.include?('#')
    true
  end

  # Returns +url+ as a String with any '#fragment' part removed.
  def unique_url(url)
    url.sub(/#.*/m, '')
  end

  # Converts an href into an absolute URL string.
  # url::  href as found in the page; surrounding whitespace is ignored.
  # base:: URL that a relative href is resolved against (defaults to the root URL).
  # Returns "http://badurl.me" when the href cannot be resolved.
  def absolute_url(url, base = @root_url)
    url = url.strip
    # \A anchors to the start of the whole string. With ^ a multi-line value
    # such as "| cmd\nhttp://..." would be passed through untouched.
    return url if url =~ /\Ahttps?:/i
    base_uri = URI.parse(base)
    begin
      (base_uri + url).to_s
    rescue StandardError
      'http://badurl.me'
    end
  end

  # Top Node of the crawled tree; nil until a page has been indexed.
  def root
    @root_node
  end

  private

  # Downloads +url+ and returns the parsed Hpricot document, or nil (after
  # logging to stderr) when the page cannot be fetched or parsed.
  # The URL is opened through its parsed URI object instead of Kernel#open so
  # that a scraped string can never be interpreted as a file path or a
  # "|command" pipe.
  def fetch(url)
    URI.parse(url).open { |f| Hpricot(f) }
  rescue StandardError => e
    warn "Failed to fetch #{url}: #{e.class}: #{e.message}"
    nil
  end
end
# One page in the crawled tree. All four attributes are plain read/write
# accessors; only +children+ is initialised, to an empty array.
class Node
  attr_accessor :children, :url, :title, :page_name

  def initialize
    @children = []
  end
end
# Renders a node tree as text: one "[url|N links]" line per node, followed by
# a "[parent url]->[child url]" line and the child's own subtree for each child.
# (The bracket notation looks like yUML diagram syntax, as the class name
# suggests -- confirm against the intended consumer.)
class Yumlify
  # The generated text.
  attr_reader :data

  # root:: top node of the tree; each node must respond to +url+ and
  #        +children+. A nil root produces empty output.
  def initialize(root)
    # String.new gives a mutable buffer that is appended to in place, instead
    # of rebuilding the whole string for every line.
    @data = String.new('')
    append(root) unless root.nil?
  end

  # Appends the lines for +node+ and, depth-first, for all of its descendants.
  def append(node)
    @data << "[#{node.url}|#{node.children.length} links]\n"
    node.children.each do |child|
      @data << "[#{node.url}]->[#{child.url}]\n"
      append(child)
    end
  end

  # Not implemented; currently does nothing and returns nil.
  def color(node)
  end
end
# TODO: implement this along the lines of Yumlify, converting the node tree
# to JIT js formatted nodes. For now it is an empty placeholder.
class Jitify; end
# Script entry point: crawl a site and print its link graph.
# The guard keeps a crawl from starting when this file is loaded by another
# program instead of being run directly. An optional first command-line
# argument overrides the default start URL.
if __FILE__ == $0
  start_url = ARGV[0] || 'http://engineroomapps.com'
  s = Spider.new
  s.start(start_url)
  puts Yumlify.new(s.root).data
  #puts Jitify.new(s.root).data
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment