Skip to content

Instantly share code, notes, and snippets.

@metade
Created December 24, 2009 11:25
Show Gist options
  • Save metade/263152 to your computer and use it in GitHub Desktop.
require 'rubygems'
require 'bbc_standards'
require 'nokogiri'
require 'open-uri'
require 'rubigraph'
require 'sqlite_cache'
include Rubigraph
Rubigraph.init
# Recursively crawls a web site, restricted to URLs under a base prefix,
# caching fetched pages in SQLite and drawing the link graph live through
# Rubigraph (one vertex per page, one edge per intra-site link).
class Crawler
  # site    - base URL; only links beginning with this prefix are followed.
  # ignores - Array of Regexps; matching URLs are neither drawn nor crawled.
  def initialize(site, ignores)
    @site = site
    @cache = SqliteCache.new('my_cache.db')
    @vertices = {}   # url => Rubigraph vertex, to avoid duplicate nodes
    @ignores = ignores
    @traversed = []  # urls already crawled, to break link cycles
  end

  # Depth-first crawl starting at +url+ (defaults to the site root).
  # +indent+ grows by one space per recursion level for the progress output.
  def crawl(url = @site, indent = '')
    puts "#{indent}#{url.sub(@site, '')}"
    stuff = process_page(url)
    make_vertex(url)
    @traversed << url
    # First pass: draw every non-ignored intra-site link as an edge.
    stuff[:site_links].sort.each do |link|
      next if ignored?(link)
      make_vertex(link)
      Edge.new(@vertices[url], @vertices[link])
    end
    # Second pass: recurse into pages not yet visited.
    stuff[:site_links].sort.each do |link|
      next if ignored?(link)
      next if @traversed.include?(link)
      crawl(link, indent + ' ')
    end
  end

  protected

  # True when +url+ matches any of the ignore patterns.
  def ignored?(url)
    @ignores.any? { |pattern| url =~ pattern }
  end

  # Create a labelled graph vertex for +url+, exactly once per URL.
  def make_vertex(url)
    return if @vertices.has_key?(url)
    @vertices[url] = Vertex.new
    @vertices[url].label = url.sub(@site, '')
    # @vertices[url].set_attribute('callback_left_doubleclick', url)
  end

  # Fetch +url+ through the SQLite cache; yields '' for any fetch error.
  def fetch_page(url)
    @cache.do_cached(url) do
      puts "fetching: #{url}"
      begin
        open(url).read
      rescue => e
        puts "#{e.message}: #{url}"
        # Was `return ''`, which returned from fetch_page itself and so
        # bypassed do_cached's store — failed URLs were re-fetched forever.
        # Yielding '' as the block value lets the failure be cached too.
        ''
      end
    end
  end

  # Fetch and parse +url+; returns a hash with all absolute links found
  # (:links) and the subset that stays within @site (:site_links).
  def process_page(url)
    html = fetch_page(url)
    # validator = BBCStandards::Validator.new(html)
    doc = Nokogiri::HTML(html)
    links = doc.xpath("//a").map do |a|
      href = a['href']
      if href =~ /^\//
        "http://www.bbc.co.uk#{href}" # root-relative: make absolute
      elsif href =~ /^#/
        nil # in-page anchor: skip
      else
        href # may be nil for <a> without href; compact drops those
      end
    end.compact
    # Normalize: strip whitespace, trailing slash, and fragment.
    links.each { |a| a.strip!; a.sub!(/\/$/, ''); a.sub!(/#.*$/, '') }
    links.uniq!
    links.delete(url)
    # Regexp.escape so '.' and '/' in the site URL match literally
    # (previously the raw URL was interpolated, letting '.' match anything).
    site_links = links.select { |a| a =~ /^#{Regexp.escape(@site)}/ }
    { :url => url,
      # :errors => validator.errors,
      :links => links,
      :site_links => site_links
    }
  end
end
# URL patterns to exclude from the crawl: photo galleries, per-user pages,
# date-based schedules, tracklisting/running-order permutations, pagination
# query strings, and raw images.
ignores = [
  %r{/radio1/photos/},
  %r{\?userid=\d+},
  %r{schedules/\d+/\d+/\d+},
  %r{/radio1/programmes/genres},
  %r{tracklistingarchive.shtml\?\d+},
  %r{galleries/\d+/\d+},
  %r{\?gp=\d+},
  %r{tracklisting_\d+},
  %r{runningorder_\d+},
  %r{\.jpg$},
]
crawler = Crawler.new('http://www.bbc.co.uk/radio1', ignores)
crawler.crawl
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment