#!/usr/local/bin/ruby
# crawler.rb
# by: Jason Larsen
# a generic web crawler that allows the user to do whatever they want by passing blocks
# @version 0.7
# 14 Dec 2009
# 0.6 things seem to be working well
# 0.7 modified so that URLs being added to the queue have their fragments truncated,
#     this should save a lot of work

require 'rubygems'
require 'net/http'
require 'nokogiri'
require 'open-uri'
require 'fileutils'

class Crawler
  attr_accessor :verbose, :mode
  attr_reader :visited, :errors, :base_url, :time

  def initialize url
    @base_url = URI::parse url
    @base_url.normalize!
    @visited, @errors, @queue, @pages = [], {}, [@base_url], 0
  end
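
  # main loop: shift URLs off the queue, fetch and parse each page,
  # rewrite its links, and save a local copy until the queue is empty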
  def crawl
    start_time = Time::now
    until @queue.empty? do
      # FIXME remove when fixed weird error
      puts
      puts "Queue Before:"
      puts @queue.size < 4 ? @queue : @queue.size
      puts "Visited size: #{@visited.size}"

      url = @queue.shift
      puts "Crawling #{url}" if verbose
      begin
        src = Nokogiri::HTML::parse(open(url.to_s))
      rescue Exception => e
        puts e if verbose
        # tally each distinct error message; a first occurrence counts as 1
        if errors[e.to_s] then
          errors[e.to_s] += 1
        else
          errors[e.to_s] = 1
        end
        next # try the next url
      end
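      # examine every anchor tag: queue newly discovered URLs and rewrite
      # in-scope hrefs so the saved copy points at the local .html files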
      for link in src.search('a')
        begin
          href = URI::parse link['href']
          href.normalize!
          href = url + href if href.relative?
        rescue Exception
          next
        end
        # skip any non-http links, e.g. mailto, etc.
        next unless href.scheme == 'http'
        # add it to the queue if we need to
        enqueue href
        if scope? href then
          if href.path.end_with? "/" then
            link['href'] += "index.html"
          else
            link['href'] = File.basename(href.path).gsub(/[.].*/,'.html')
          end
        end
      end
      just_visited url
      # write to file
      # figure out where we're writing it
      url.path = url.path + "index.html" if url.path.end_with? '/'
      file_name = File.basename(url.path).gsub(/[.].*/,'') + '.html'
      dirs = File.dirname(url.host + url.path)
      # make the directories
      FileUtils.makedirs(dirs) unless File.directory?(dirs)
      file = File.open(dirs + '/' + file_name, 'w')
      file.puts src.inner_html
      file.close
      puts "Saved as #{file.path}"

      # FIXME remove when fixed weird error
      puts "Queue After:"
      puts @queue.size < 4 ? @queue : @queue.size
      puts "Visited size: #{@visited.size}"
    end
    @time = Time::now - start_time
  end
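
  # summarize the crawl: pages cached, elapsed time, and error counts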
  def report
    error_report = ""
    @errors.each { |error, count| error_report += "\t#{error}: #{count}\n" }
    "Pages cached: #{@pages}\nTime elapsed: #{@time} seconds\nErrors:\n" + error_report
  end

  private
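
  # queue a URL only if it is in scope and has not already been visited or queued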
  def enqueue url
    url.fragment = nil # by getting rid of the fragments we save a lot of work
    if scope? url and not visited? url and not queued? url then
      # debug output while the queue is still small
      if @queue.size < 4 then
        puts "#{url} is in scope" if scope? url
        puts "#{url} is not visited" if not visited? url
        puts "#{url} is not queued" if not queued? url
      end
      puts "...Queuing #{url}..." if verbose
      @queue.push url
    end
  end
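
  # record that a URL has been fetched and saved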
  def just_visited url
    @pages += 1
    @visited.push url
    puts "visited #{url}" if @queue.size < 4 and visited? url
  end
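
  # a URL is in scope when it shares the base URL's scheme and host and its
  # path falls under the base URL's directory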
  def scope? url
    return false unless url.scheme == @base_url.scheme and url.host == @base_url.host
    url.path.start_with? @base_url.path[0..@base_url.path.rindex('/')]
  end
  def visited? url
    @visited.include? url
  end

  def queued? url
    @queue.include? url
  end
end
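
# example: crawl the BYU catalog and print a summary report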
crawler = Crawler.new 'http://saas.byu.edu/catalog/2005-2006ucat/'
crawler.verbose = true
crawler.crawl
puts crawler.report