jarsen/crawler.rb

## crawler.rb
#!/usr/local/bin/ruby

# crawler.rb
# by: Jason Larsen
# a generic web crawler that allows the user to do whatever they want by passing blocks
# @version 0.7
# 14 Dec 2009
# 0.6 things seem to be working well
# 0.7 modified so that URLs being added to the queue have their fragments
# stripped; this should save a lot of work

require 'rubygems'
require 'net/http'
require 'nokogiri'
require 'open-uri'
require 'fileutils'

# Crawls every page reachable from a starting URL that shares the start URL's
# scheme, host and directory, and saves a copy of each page as
# "<host>/<path>.html" relative to the current working directory.
#
# Links inside a saved page that point at other in-scope pages are rewritten
# to relative paths between the saved copies. Query strings are ignored when
# choosing the file name, so URLs that differ only by query share one file.
#
#   crawler = Crawler.new 'http://example.com/docs/'
#   crawler.verbose = true   # print progress while crawling
#   crawler.crawl
#   puts crawler.report
class Crawler
    attr_accessor :verbose, :mode
    attr_reader :visited, :errors, :base_url, :time

    # url - String URL to start from; everything up to its last "/" defines
    #       the directory the crawl is confined to.
    #
    # Raises URI::InvalidURIError if +url+ cannot be parsed.
    def initialize(url)
        @base_url = URI.parse(url)
        @base_url.normalize!
        @visited, @errors, @queue, @pages = [], {}, [@base_url], 0
    end

    # Work through the queue until it is empty: fetch each page, queue the
    # in-scope links it contains, and write the page to disk.
    #
    # Pages that cannot be fetched or parsed are tallied in #errors and
    # skipped. Errors raised while writing a file are not rescued.
    # Sets #time to the elapsed wall-clock seconds.
    def crawl
        start_time = Time.now
        until @queue.empty?
            url = @queue.shift
            puts "Crawling #{url}" if verbose

            begin
                src = fetch(url)
            rescue StandardError => e
                # StandardError rather than Exception so that Ctrl-C and
                # exit still stop the crawl.
                puts e if verbose
                record_error(e)
                next # try the next url
            end

            # Mark the page before scanning its links, so a link from the
            # page back to itself is seen as visited and not queued again.
            just_visited(url)
            src.search('a').each { |link| follow(link, url) }
            save(src, url)
        end
        @time = Time.now - start_time
    end

    # Returns a multi-line String summarising pages saved, elapsed time and
    # how many times each distinct error message occurred.
    def report
        error_lines = @errors.map { |error, count| "\t#{error}: #{count}\n" }
        "Pages cached: #{@pages}\nTime elapsed: #{@time} seconds\nErrors:\n" + error_lines.join
    end

    private

    # Download +url+ and parse the response as HTML. The block form closes
    # the handle as soon as parsing is done.
    def fetch(url)
        # Only URI classes that open-uri extends respond to #open. Refusing
        # anything else means a scheme-less start URL is reported as an
        # error rather than handed to a generic open call.
        raise ArgumentError, "cannot fetch #{url}: unsupported URL scheme" unless url.respond_to?(:open)

        url.open { |io| Nokogiri::HTML.parse(io) }
    end

    # Count one more occurrence of +error+, keyed by its message.
    def record_error(error)
        key = error.to_s
        @errors[key] = @errors.fetch(key, 0) + 1
    end

    # Handle one <a> element found on +page+: queue its target if needed and,
    # when the target is in scope, point the element at the saved copy.
    def follow(link, page)
        raw = link['href']
        return if raw.nil?

        begin
            # Resolve against the page before normalising: normalising a bare
            # "#frag" or "?query" reference first would give it the path "/".
            href = (page + raw.strip).normalize
        rescue StandardError
            return # unparsable href: leave the element untouched
        end

        # skip any non-http(s) links e.g. mailto, javascript, etc.
        return unless %w[http https].include?(href.scheme)

        fragment = href.fragment # enqueue strips it; keep it for the rewritten link
        enqueue(href)
        return unless scope?(href)

        local = relative_mirror_path(page, href)
        link['href'] = fragment ? "#{local}##{fragment}" : local
    end

    # Write the parsed page +src+ fetched from +url+ to its mirror location,
    # creating directories as needed.
    def save(src, url)
        path = mirror_path(url)
        FileUtils.makedirs(File.dirname(path))
        File.open(path, 'w') { |file| file.puts src.inner_html }
        puts "Saved as #{path}"
    end

    # Where the copy of +url+ is stored, relative to the working directory:
    # the host, then the URL's directories, then the last path segment with
    # everything from its first "." replaced by ".html". Directory URLs map
    # to "index.html".
    #
    # "." and ".." segments are collapsed here because the path comes from
    # remote HTML and must not be able to climb out of the host directory.
    def mirror_path(url)
        path = url.path.to_s
        segments = []
        path.split('/').each do |segment|
            if segment == '..'
                segments.pop
            elsif !segment.empty? && segment != '.'
                segments << segment
            end
        end

        directory_url = path.match?(%r{(\A|/)\.{0,2}\z})
        leaf = directory_url ? 'index' : segments.pop.to_s
        parts = segments + [leaf.sub(/[.].*/, '') + '.html']
        host = url.host.to_s
        parts.unshift(host) unless host.empty?
        File.join(*parts)
    end

    # Relative path leading from the saved copy of +page+ to the saved copy
    # of +target+, e.g. "../b/page.html".
    def relative_mirror_path(page, target)
        here = mirror_path(page).split('/')[0...-1] # directory holding the page
        there = mirror_path(target).split('/')
        shared = 0
        # Never consume the target's file name, even when both paths match.
        shared += 1 while shared < here.size && shared < there.size - 1 && here[shared] == there[shared]
        (['..'] * (here.size - shared) + there[shared..-1]).join('/')
    end

    # Add +url+ to the queue unless it is out of scope, already visited or
    # already waiting. Clears the fragment on +url+ itself.
    def enqueue(url)
        url.fragment = nil # by getting rid of the fragments we save a lot of work
        return unless scope?(url) && !visited?(url) && !queued?(url)

        puts "...Queuing #{url}..." if verbose
        @queue.push(url)
    end

    # Record +url+ as successfully fetched.
    def just_visited(url)
        @pages += 1
        @visited.push(url)
    end

    # True when +url+ has the start URL's scheme and host and its path lies
    # under the start URL's directory.
    def scope?(url)
        return false unless url.scheme == @base_url.scheme && url.host == @base_url.host

        base_path = @base_url.path.to_s
        last_slash = base_path.rindex('/')
        base_dir = last_slash ? base_path[0..last_slash] : '/'
        url.path.to_s.start_with?(base_dir)
    end

    def visited?(url)
        @visited.include?(url)
    end

    def queued?(url)
        @queue.include?(url)
    end
end

# Mirror the catalog hosted at saas.byu.edu with progress output enabled,
# then print the summary report.
crawler = Crawler.new('http://saas.byu.edu/catalog/2005-2006ucat/').tap do |site|
    site.verbose = true
    site.crawl
end
puts crawler.report
	#!/usr/local/bin/ruby

	# crawler.rb
	# by: Jason Larsen
	# a generic web crawler that allows the user to do whatever they want by passing blocks
	# @version 0.7
	# 14 Dec 2009
	# 0.6 things seem to be working well
	# 0.7 modified so that URLs being added to the queue have their fragments
	# stripped; this should save a lot of work

	require 'rubygems'
	require 'net/http'
	require 'nokogiri'
	require 'open-uri'
	require 'fileutils'

	# Crawls every page reachable from a starting URL that shares the start URL's
	# scheme, host and directory, and saves a copy of each page as
	# "<host>/<path>.html" relative to the current working directory.
	#
	# Links inside a saved page that point at other in-scope pages are rewritten
	# to relative paths between the saved copies. Query strings are ignored when
	# choosing the file name, so URLs that differ only by query share one file.
	#
	#   crawler = Crawler.new 'http://example.com/docs/'
	#   crawler.verbose = true   # print progress while crawling
	#   crawler.crawl
	#   puts crawler.report
	class Crawler
	    attr_accessor :verbose, :mode
	    attr_reader :visited, :errors, :base_url, :time

	    # url - String URL to start from; everything up to its last "/" defines
	    #       the directory the crawl is confined to.
	    #
	    # Raises URI::InvalidURIError if +url+ cannot be parsed.
	    def initialize(url)
	        @base_url = URI.parse(url)
	        @base_url.normalize!
	        @visited, @errors, @queue, @pages = [], {}, [@base_url], 0
	    end

	    # Work through the queue until it is empty: fetch each page, queue the
	    # in-scope links it contains, and write the page to disk.
	    #
	    # Pages that cannot be fetched or parsed are tallied in #errors and
	    # skipped. Errors raised while writing a file are not rescued.
	    # Sets #time to the elapsed wall-clock seconds.
	    def crawl
	        start_time = Time.now
	        until @queue.empty?
	            url = @queue.shift
	            puts "Crawling #{url}" if verbose

	            begin
	                src = fetch(url)
	            rescue StandardError => e
	                # StandardError rather than Exception so that Ctrl-C and
	                # exit still stop the crawl.
	                puts e if verbose
	                record_error(e)
	                next # try the next url
	            end

	            # Mark the page before scanning its links, so a link from the
	            # page back to itself is seen as visited and not queued again.
	            just_visited(url)
	            src.search('a').each { |link| follow(link, url) }
	            save(src, url)
	        end
	        @time = Time.now - start_time
	    end

	    # Returns a multi-line String summarising pages saved, elapsed time and
	    # how many times each distinct error message occurred.
	    def report
	        error_lines = @errors.map { |error, count| "\t#{error}: #{count}\n" }
	        "Pages cached: #{@pages}\nTime elapsed: #{@time} seconds\nErrors:\n" + error_lines.join
	    end

	    private

	    # Download +url+ and parse the response as HTML. The block form closes
	    # the handle as soon as parsing is done.
	    def fetch(url)
	        # Only URI classes that open-uri extends respond to #open. Refusing
	        # anything else means a scheme-less start URL is reported as an
	        # error rather than handed to a generic open call.
	        raise ArgumentError, "cannot fetch #{url}: unsupported URL scheme" unless url.respond_to?(:open)

	        url.open { |io| Nokogiri::HTML.parse(io) }
	    end

	    # Count one more occurrence of +error+, keyed by its message.
	    def record_error(error)
	        key = error.to_s
	        @errors[key] = @errors.fetch(key, 0) + 1
	    end

	    # Handle one <a> element found on +page+: queue its target if needed and,
	    # when the target is in scope, point the element at the saved copy.
	    def follow(link, page)
	        raw = link['href']
	        return if raw.nil?

	        begin
	            # Resolve against the page before normalising: normalising a bare
	            # "#frag" or "?query" reference first would give it the path "/".
	            href = (page + raw.strip).normalize
	        rescue StandardError
	            return # unparsable href: leave the element untouched
	        end

	        # skip any non-http(s) links e.g. mailto, javascript, etc.
	        return unless %w[http https].include?(href.scheme)

	        fragment = href.fragment # enqueue strips it; keep it for the rewritten link
	        enqueue(href)
	        return unless scope?(href)

	        local = relative_mirror_path(page, href)
	        link['href'] = fragment ? "#{local}##{fragment}" : local
	    end

	    # Write the parsed page +src+ fetched from +url+ to its mirror location,
	    # creating directories as needed.
	    def save(src, url)
	        path = mirror_path(url)
	        FileUtils.makedirs(File.dirname(path))
	        File.open(path, 'w') { |file| file.puts src.inner_html }
	        puts "Saved as #{path}"
	    end

	    # Where the copy of +url+ is stored, relative to the working directory:
	    # the host, then the URL's directories, then the last path segment with
	    # everything from its first "." replaced by ".html". Directory URLs map
	    # to "index.html".
	    #
	    # "." and ".." segments are collapsed here because the path comes from
	    # remote HTML and must not be able to climb out of the host directory.
	    def mirror_path(url)
	        path = url.path.to_s
	        segments = []
	        path.split('/').each do |segment|
	            if segment == '..'
	                segments.pop
	            elsif !segment.empty? && segment != '.'
	                segments << segment
	            end
	        end

	        directory_url = path.match?(%r{(\A|/)\.{0,2}\z})
	        leaf = directory_url ? 'index' : segments.pop.to_s
	        parts = segments + [leaf.sub(/[.].*/, '') + '.html']
	        host = url.host.to_s
	        parts.unshift(host) unless host.empty?
	        File.join(*parts)
	    end

	    # Relative path leading from the saved copy of +page+ to the saved copy
	    # of +target+, e.g. "../b/page.html".
	    def relative_mirror_path(page, target)
	        here = mirror_path(page).split('/')[0...-1] # directory holding the page
	        there = mirror_path(target).split('/')
	        shared = 0
	        # Never consume the target's file name, even when both paths match.
	        shared += 1 while shared < here.size && shared < there.size - 1 && here[shared] == there[shared]
	        (['..'] * (here.size - shared) + there[shared..-1]).join('/')
	    end

	    # Add +url+ to the queue unless it is out of scope, already visited or
	    # already waiting. Clears the fragment on +url+ itself.
	    def enqueue(url)
	        url.fragment = nil # by getting rid of the fragments we save a lot of work
	        return unless scope?(url) && !visited?(url) && !queued?(url)

	        puts "...Queuing #{url}..." if verbose
	        @queue.push(url)
	    end

	    # Record +url+ as successfully fetched.
	    def just_visited(url)
	        @pages += 1
	        @visited.push(url)
	    end

	    # True when +url+ has the start URL's scheme and host and its path lies
	    # under the start URL's directory.
	    def scope?(url)
	        return false unless url.scheme == @base_url.scheme && url.host == @base_url.host

	        base_path = @base_url.path.to_s
	        last_slash = base_path.rindex('/')
	        base_dir = last_slash ? base_path[0..last_slash] : '/'
	        url.path.to_s.start_with?(base_dir)
	    end

	    def visited?(url)
	        @visited.include?(url)
	    end

	    def queued?(url)
	        @queue.include?(url)
	    end
	end

	# Mirror the catalog hosted at saas.byu.edu with progress output enabled,
	# then print the summary report.
	crawler = Crawler.new('http://saas.byu.edu/catalog/2005-2006ucat/').tap do |site|
	    site.verbose = true
	    site.crawl
	end
	puts crawler.report