Skip to content

@petehamilton /simple_crawler.rb
Last active

Embed URL

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Script to Crawl a Site for Links/Assets
# GEMFILE
# source 'https://rubygems.org'
#
# gem 'nokogiri'
# gem 'awesome_print'
#!/usr/bin/env ruby
require 'open-uri'
require 'optparse'
require 'monitor'
require 'nokogiri'
require 'awesome_print'
# The program takes a domain/site and crawls it.
# It then outputs the resulting hash to the screen.
#
# Author:: Peter Hamilton (mailto:peter@inspiredpixel.net)
# This class is what's used to crawl a site
# It stores a hash @pages of url => links and assets(images/js/css) which can be
# retrieved using get_pages
#
# By giving it a number of threads > 1, you can speed up the processing considerably.
# For example:
# $ time ./simple_crawler.rb https://gocardless.com -t 1 => 1.94s user 0.13s system 9% cpu 21.552 total
# $ time ./simple_crawler.rb https://gocardless.com -t 2 => 1.82s user 0.11s system 15% cpu 12.627 total
# $ time ./simple_crawler.rb https://gocardless.com -t 3 => 1.79s user 0.11s system 19% cpu 9.599 total
# $ time ./simple_crawler.rb https://gocardless.com -t 5 => 1.75s user 0.11s system 28% cpu 6.570 total
# $ time ./simple_crawler.rb https://gocardless.com -t 10 => 1.69s user 0.10s system 38% cpu 4.688 total
# Crawls a single site starting from a root URL, recording for every page
# the links and the assets (images/js/css) it references.
#
# Results live in a hash of url => {links:, assets:} retrievable with
# #get_pages after #crawl completes.
#
# A thread pool of size thread_count (default 1) fetches pages concurrently;
# the work is I/O bound, so extra threads speed crawling up considerably
# even under the GVL.
class SimpleCrawler
  # Set up the crawler.
  #
  # root_url - e.g. "https://gocardless.com"
  # options  - :verbose      show all urls as they're processed
  #            :thread_count use a thread pool of this size
  def initialize(root_url, options = {})
    @verbose = options[:verbose] || false
    @thread_count = options[:thread_count] || 1
    # @root_url must exist (as "") before process_url runs, because
    # process_url consults it when filtering external links.
    @root_url = ""
    @root_url = process_url root_url
  end

  # Perform the site crawl:
  # 1. Seed a queue of urls to crawl with the root url
  # 2. Create a thread pool of size thread_count
  # 3. Workers pull urls, fetch them, and enqueue newly discovered links
  #    until the queue drains and no worker is mid-fetch.
  def crawl
    puts "Crawling #{@root_url}" if @verbose
    @pages = {}
    @crawl_queue = Queue.new
    @crawl_queue << @root_url
    @crawl_queue.extend MonitorMixin
    crawl_queue_cond = @crawl_queue.new_cond
    threads = []
    active_threads = 0
    crawl_complete = false
    @thread_count.times do
      # Register/count each active thread
      @crawl_queue.synchronize do
        active_threads += 1
      end
      resources = nil
      url = nil
      threads << Thread.new do
        loop do
          # Synchronize the critical section that mutates @pages and the queue
          @crawl_queue.synchronize do
            if resources
              update_pages_and_queue(url, resources)
              print_status(url)
            elsif url
              # URL error: drop the placeholder entry rather than keep a
              # stale nil. Could add n-retries here in the future.
              @pages.delete url
            end
            # 1. Empty queue + this being the only active thread means the
            #    site crawl is complete (flag is shared by all threads).
            # 2. Wake the other threads so they either process more urls or
            #    exit, depending on 'crawl_complete' and queue state.
            # 3. Sleep until the queue has work or crawling is marked done.
            # 4. On wake-up, exit if we're done crawling.
            # 5. Otherwise bump the active thread count and loop again.
            crawl_complete = true if @crawl_queue.empty? && active_threads == 1
            crawl_queue_cond.broadcast unless @crawl_queue.empty? && !crawl_complete
            active_threads -= 1
            crawl_queue_cond.wait_while { @crawl_queue.empty? && !crawl_complete }
            Thread.exit if crawl_complete
            active_threads += 1
            url = @crawl_queue.shift
          end
          # The network fetch happens outside the lock so workers overlap I/O.
          resources = crawl_url url
        end
      end
    end
    threads.each(&:join)
  end

  # The crawl results: a hash of url => {links: [...], assets: [...]}.
  # Failed fetches are removed; queued-but-unfetched entries are nil.
  def get_pages
    @pages
  end

  private

  # Retrieve the HTML for url, extract all links and assets and return them
  # in a hash {links:, assets:} of cleaned absolute urls, or nil on error.
  def crawl_url(url)
    begin
      # URI.open: Kernel#open no longer accepts URLs on modern Rubies.
      html = Nokogiri::HTML(URI.open(url).read)
    rescue StandardError => e
      # StandardError (not Exception) so signals and exit still propagate.
      puts "Error reading #{url} :: #{e}" if @verbose
      return nil
    end
    links = html.css('a').map { |link| process_url link['href'] }.compact
    assets = html.css('link').map { |link| process_url link['href'] }.compact
    assets += html.css('img').map { |link| process_url link['src'] }.compact
    assets += html.css('script').map { |link| process_url link['src'] }.compact
    { links: links.uniq, assets: assets.uniq }
  end

  # Given a url, clean it up:
  # - strip fragment identifiers and query parameters
  # - strip trailing slashes
  # - discard external links, mailtos, tels and javascript triggers
  # - return an absolute URL joined against @root_url, or nil if discarded
  def process_url(url)
    return nil if url.nil? || url.empty?
    url = url.gsub(/[#?].*/, '') # Clear fragment and query params
    url = url.gsub(/\/$/, '')    # Remove trailing slashes
    bad_matches = [
      /^(http(?!#{Regexp.escape @root_url.gsub("http", "")})|\/\/)/, # Discard external links
      /^mailto/,     # Discard mailto links
      /^tel/,        # Discard telephone
      /^javascript/  # Discard javascript triggers
    ]
    # Case is slightly more open to extension than chained conditionals
    case url
    when *bad_matches
      nil
    else
      URI.join(@root_url, url).to_s
    end
  end

  # Output the current completions/total_queued to the console.
  # Defaults to single-line-update; verbose (-v) prints one line per url.
  def print_status(url)
    done = @pages.values.compact.length.to_s.rjust(2, '0')
    total = @pages.length.to_s.rjust(2, '0')
    print "\r#{' ' * 80}\r" unless @verbose
    print "Crawled #{done}/#{total}: #{url}"
    print "\n" if @verbose
    STDOUT.flush
  end

  # Set the page resources for the given URL and enqueue any links not seen
  # before. A nil placeholder in @pages marks a url as queued/in-flight.
  def update_pages_and_queue(url, resources)
    @pages[url] = resources
    resources[:links].each do |link|
      unless @pages.key? link
        @crawl_queue.enq(link)
        @pages[link] = nil
      end
    end
  end
end
# ---- Command line interface --------------------------------------------
# Gather command line options
options = {}
options[:verbose] = false
opt_parser = OptionParser.new do |opt|
  opt.banner = "Usage: simple_crawler URL [OPTIONS]"
  opt.separator ""
  opt.separator "Options"
  opt.on("-t n", "--thread-count=n", OptionParser::DecimalInteger,
         "Process using a thread pool of size n") do |thread_count|
    options[:thread_count] = thread_count
  end
  opt.on("-v", "--verbose", "show all urls processed") do
    options[:verbose] = true
  end
  opt.on("-h", "--help", "help (show this)") do
    # 'opt' is the parser itself; avoids relying on the outer local.
    puts opt
    exit
  end
end
# Run crawler
opt_parser.parse!
# A domain URL is required; a usage error should exit non-zero.
if ARGV.count < 1
  puts opt_parser
  exit 1
end
# Crawl the given domain and print the resulting pages hash
domain_url = ARGV[0]
crawler = SimpleCrawler.new(domain_url, options)
crawler.crawl
ap crawler.get_pages
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Something went wrong with that request. Please try again.