Skip to content

Instantly share code, notes, and snippets.

@jarsen
Created December 16, 2009 19:38
Show Gist options
  • Save jarsen/258107 to your computer and use it in GitHub Desktop.
Save jarsen/258107 to your computer and use it in GitHub Desktop.
#!/usr/local/bin/ruby
# crawler.rb
# by: Jason Larsen
# a generic web crawler that allows the user to do whatever they want by passing blocks
# @version 0.7
# 14 Dec 2009
# 0.6 things seem to be working well
# 0.7 modified so that URLs have their fragments stripped before being added
# to the queue; this should save a lot of work
require 'rubygems'
require 'net/http'
require 'nokogiri'
require 'open-uri'
require 'fileutils'
# Crawls a single site starting from a base URL and mirrors every in-scope
# page to disk under "<host>/<path>" (relative to the working directory).
#
# A URL is "in scope" when it has the same scheme and host as the base URL and
# its path lives under the base URL's directory.
class Crawler
  # Link schemes the crawler will consider; everything else (mailto:, ftp:,
  # javascript:, ...) is skipped.
  CRAWLABLE_SCHEMES = %w[http https].freeze

  attr_accessor :verbose, :mode
  attr_reader :visited, :errors, :base_url, :time

  # @param url [String] the URL to start crawling from
  def initialize(url)
    @base_url = URI.parse(url)
    @base_url.normalize!
    @visited = []
    @errors = {}
    @queue = [@base_url]
    @pages = 0
  end

  # Fetches every queued URL, queues the in-scope links it finds, rewrites
  # those links to their local file names and saves the page to disk.
  # Fetch/parse failures are tallied in #errors and the URL is skipped.
  #
  # @return [Float] elapsed seconds (also stored in #time)
  def crawl
    start_time = Time.now
    until @queue.empty?
      url = @queue.shift
      puts "Crawling #{url}" if verbose
      begin
        # URI#open (from open-uri) closes the handle when the block returns.
        src = url.open { |io| Nokogiri::HTML.parse(io) }
      rescue StandardError => e
        # StandardError only: Interrupt/SystemExit must still stop the crawl.
        puts e if verbose
        record_error e
        next # try the next url
      end
      # Mark the page as visited *before* scanning its links so that a page
      # linking to itself is not queued a second time.
      just_visited url
      src.search('a').each { |link| process_link(url, link) }
      save_page url, src
    end
    @time = Time.now - start_time
  end

  # @return [String] a human-readable summary of pages saved, time taken and
  #   the errors encountered (one line per distinct error message)
  def report
    error_report = @errors.map { |error, count| "\t#{error}: #{count}\n" }.join
    "Pages cached: #{@pages}\nTime elapsed: #{@time} seconds\nErrors:\n" + error_report
  end

  private

  # Increments the tally for this error's message (first occurrence counts 1).
  def record_error(error)
    key = error.to_s
    @errors[key] = @errors.fetch(key, 0) + 1
  end

  # Queues the target of one <a> element if needed and, when the target is in
  # scope, rewrites the element's href to the name the page is saved under.
  #
  # @param page_url [URI] URL of the page the link was found on
  # @param link [#[], #[]=] the anchor element (anything indexable by 'href')
  def process_link(page_url, link)
    raw = link['href']
    return if raw.nil?

    begin
      href = URI.parse(raw)
      # Resolve first, normalize second: normalize! turns an empty path into
      # "/", which would make "#frag" / "?query" links point at the site root.
      href = href.relative? ? page_url + href : href.dup
      href.normalize!
    rescue StandardError
      return # unparsable href, nothing to follow
    end

    # skip any non-web links e.g. mailto, etc.
    return unless CRAWLABLE_SCHEMES.include?(href.scheme)

    # add it to the queue if we need to
    enqueue href
    return unless scope?(href)

    if href.path.end_with?('/')
      # Only touch hrefs that literally end in "/"; appending to "#top" or
      # "dir/?q=1" would corrupt the fragment/query.
      link['href'] = raw + 'index.html' if raw.end_with?('/')
    else
      # NOTE(review): this keeps only the base name, so any directory part of
      # the original href is dropped from the rewritten link.
      link['href'] = local_file_name(href.path)
    end
  end

  # Writes the (link-rewritten) page to "<host>/<dir>/<name>.html".
  # Works on a copy of the path: +url+ is also stored in @visited, and
  # mutating it would make the visited check miss later links to this page.
  def save_page(url, src)
    path = url.path
    path += 'index.html' if path.end_with?('/')
    dirs = File.dirname(url.host + path)
    # make the directories
    FileUtils.makedirs(dirs) unless File.directory?(dirs)
    target = File.join(dirs, local_file_name(path))
    File.open(target, 'w') { |file| file.puts src.inner_html }
    puts "Saved as #{target}"
  end

  # Name a page is stored under: the path's base name with everything from
  # the first dot onwards replaced by ".html".
  def local_file_name(path)
    File.basename(path).gsub(/[.].*/, '') + '.html'
  end

  # Adds +url+ to the queue unless it is out of scope, already visited or
  # already waiting. The fragment is stripped (on the given object) first,
  # since "page#a" and "page#b" are the same document.
  def enqueue(url)
    url.fragment = nil
    return unless scope?(url) && !visited?(url) && !queued?(url)

    puts "...Queuing #{url}..." if verbose
    @queue.push url
  end

  # Records a successfully fetched page.
  def just_visited(url)
    @pages += 1
    @visited.push url
  end

  # @return [Boolean] true when +url+ shares scheme and host with the base
  #   URL and its path is under the base URL's directory
  def scope?(url)
    return false unless url.scheme == @base_url.scheme && url.host == @base_url.host
    # Paths become on-disk locations; refuse ".." so a crafted link cannot
    # make a page be written outside the mirror directory.
    return false if url.path.split('/').include?('..')

    url.path.start_with?(@base_url.path[0..@base_url.path.rindex('/')])
  end

  def visited?(url)
    @visited.include?(url)
  end

  def queued?(url)
    @queue.include?(url)
  end
end
# Run a crawl only when this file is executed directly, so that requiring it
# for the Crawler class does not kick off a network crawl as a side effect.
# The start URL may be given as the first command-line argument; without one
# the original catalog URL is used.
if __FILE__ == $PROGRAM_NAME
  crawler = Crawler.new(ARGV.fetch(0, 'http://saas.byu.edu/catalog/2005-2006ucat/'))
  crawler.verbose = true
  crawler.crawl
  puts crawler.report
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment