Script to Crawl a Site for Links/Assets

simple_crawler.rb
#!/usr/bin/env ruby

# GEMFILE
# source 'https://rubygems.org'
#
# gem 'nokogiri'
# gem 'awesome_print'
 
require 'open-uri'
require 'optparse'
require 'monitor'
 
require 'nokogiri'
require 'awesome_print'
 
# The program takes a domain/site and crawls it.
# It then outputs the resulting hash to the screen.
#
# Author:: Peter Hamilton (mailto:peter@inspiredpixel.net)
 
# This class is what's used to crawl a site.
# It stores a hash @pages of url => links and assets (images/js/css), which can be
# retrieved using get_pages.
#
# By giving it a number of threads > 1, you can speed up the processing considerably.
# For example:
# $ time ./simple_crawler.rb https://gocardless.com -t 1 => 1.94s user 0.13s system 9% cpu 21.552 total
# $ time ./simple_crawler.rb https://gocardless.com -t 2 => 1.82s user 0.11s system 15% cpu 12.627 total
# $ time ./simple_crawler.rb https://gocardless.com -t 3 => 1.79s user 0.11s system 19% cpu 9.599 total
# $ time ./simple_crawler.rb https://gocardless.com -t 5 => 1.75s user 0.11s system 28% cpu 6.570 total
# $ time ./simple_crawler.rb https://gocardless.com -t 10 => 1.69s user 0.10s system 38% cpu 4.688 total
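#
# A minimal programmatic usage sketch (the gems in the Gemfile above must be
# installed; the urls and output shape shown are illustrative only):
#
#   crawler = SimpleCrawler.new('https://gocardless.com', thread_count: 4)
#   crawler.crawl
#   crawler.get_pages
#   # => { "https://gocardless.com"       => { links: [...], assets: [...] },
#   #      "https://gocardless.com/about" => { links: [...], assets: [...] },
#   #      ... }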
 
class SimpleCrawler

  # Set up the crawler
  # Takes a root url e.g. https://gocardless.com and an options hash
  #
  # Options:
  # - verbose - show all urls as they're processed
  # - thread_count - use a thread pool of the given size
  def initialize(root_url, options = {})
    @verbose = options[:verbose] || false

    @thread_count = options[:thread_count] || 1

    # process_url reads @root_url, so it must be initialised before the call
    @root_url = ""
    @root_url = process_url root_url
  end
 
  # Perform the site crawl
  # 1. Creates a queue of urls to crawl (starting with the root url)
  # 2. Creates a thread pool (of size thread_count, set at initialisation)
  # 3. While the queue is not empty, threads process URLs
  def crawl
    puts "Crawling #{@root_url}" if @verbose
    @pages = {}
    @crawl_queue = Queue.new
    @crawl_queue << @root_url

    @crawl_queue.extend MonitorMixin
    crawl_queue_cond = @crawl_queue.new_cond

    threads = []
    active_threads = 0
    crawl_complete = false

    @thread_count.times do
      # Register/count each active thread
      @crawl_queue.synchronize do
        active_threads += 1
      end

      # Per-thread state, captured by the thread's closure
      resources = nil
      url = nil

      threads << Thread.new do
        loop do
          # Synchronize on critical code which adds to the pages and queue
          @crawl_queue.synchronize do
            if resources
              update_pages_and_queue(url, resources)
              print_status(url)
            else
              # URL error, skip. Could add future functionality for n retries?
              @pages.delete url
            end

            # 1. An empty queue with no other threads running means we've
            #    completed the site crawl. 'crawl_complete' can be modified by all threads
            # 2. Wake up other threads, which will either process more urls or
            #    exit depending on 'crawl_complete' and queue state
            # 3. Wait until the queue is not empty or crawling is marked as complete
            # 4. Thread has woken up; exit if we're done crawling
            # 5. If not done, bump the active thread count and re-enter the loop
            crawl_complete = true if @crawl_queue.empty? && active_threads == 1
            crawl_queue_cond.broadcast unless @crawl_queue.empty? && !crawl_complete
            active_threads -= 1
            crawl_queue_cond.wait_while { @crawl_queue.empty? && !crawl_complete }
            Thread.exit if crawl_complete
            active_threads += 1

            url = @crawl_queue.shift
          end

          resources = crawl_url url
        end
      end
    end

    threads.each { |t| t.join }
  end
 
  # Get the pages hash. Each entry contains a hash for the links and assets
  def get_pages
    @pages
  end

  private
 
  # Retrieves the HTML for the given url, extracts all links and assets, and returns them in a hash
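  # A rough sketch of the return shape (the urls here are invented for illustration):
  #   crawl_url('https://gocardless.com/about')
  #   # => { links:  ["https://gocardless.com", "https://gocardless.com/pricing", ...],
  #   #      assets: ["https://gocardless.com/css/main.css", ...] }
  # Returns nil if the page could not be fetched.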
  def crawl_url(url)
    begin
      # URI.open rather than Kernel#open: open-uri no longer patches Kernel#open in Ruby 3+
      html = Nokogiri::HTML(URI.open(url).read)
    rescue StandardError => e
      puts "Error reading #{url} :: #{e}" if @verbose
      return nil
    end

    links = html.css('a').map { |link| process_url link['href'] }.compact
    assets = html.css('link').map { |link| process_url link['href'] }.compact
    assets += html.css('img').map { |link| process_url link['src'] }.compact
    assets += html.css('script').map { |link| process_url link['src'] }.compact

    return { links: links.uniq, assets: assets.uniq }
  end
 
  # Given a url, clean it up:
  # - Remove any hash fragments or query parameters
  # - Discard external links, mailtos, tels and javascript triggers
  # - Ensure the returned URL is absolute
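  # Illustrative examples, assuming @root_url is "https://gocardless.com":
  #   process_url('/about?foo=bar')             # => "https://gocardless.com/about"
  #   process_url('#top')                       # => "https://gocardless.com"
  #   process_url('mailto:x@y.com')             # => nil
  #   process_url('https://example.com/page')   # => nil (external)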
  def process_url(url)
    return nil if url.nil? || url.empty?
    url = url.gsub(/[#?].*/, '') # Clear hash fragments and query params
    url = url.gsub(/\/$/, '')    # Remove trailing slashes

    bad_matches = [
      /^(http(?!#{Regexp.escape @root_url.gsub("http","")})|\/\/)/, # Discard external links
      /^mailto/,     # Discard mailto links
      /^tel/,        # Discard telephone links
      /^javascript/  # Discard javascript triggers
    ]

    # A case statement is slightly more open to extension
    case url
    when *bad_matches
      return nil
    else
      return URI.join(@root_url, url).to_s
    end
  end

  # Output the current completions/total_queued to the console
  # Defaults to single-line-update but verbose (-v) mode triggers full output
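  # In non-verbose mode this produces a single self-overwriting line such as
  #   Crawled 07/23: https://gocardless.com/blog
  # (the counts and url above are illustrative)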
  def print_status(url)
    done = @pages.values.compact.length.to_s.rjust(2, '0')
    total = @pages.length.to_s.rjust(2, '0')
    print "\r#{" "*80}\r" unless @verbose
    print "Crawled #{done}/#{total}: #{url}"
    print "\n" if @verbose
    STDOUT.flush
  end
 
  # Sets the page resources for the given URL and adds any new links
  # to the crawl queue
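  # Note: links that are queued but not yet crawled are recorded as
  # @pages[link] = nil, which doubles as the "seen" set so each URL is
  # only enqueued once.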
  def update_pages_and_queue(url, resources)
    @pages[url] = resources
    resources[:links].each do |link|
      unless @pages.has_key? link
        @crawl_queue.enq(link)
        @pages[link] = nil
      end
    end
  end
end
 
# Gather Command Line Options
options = {}
options[:verbose] = false

opt_parser = OptionParser.new do |opt|
  opt.banner = "Usage: simple_crawler URL [OPTIONS]"
  opt.separator ""
  opt.separator "Options"

  opt.on("-t n", "--thread-count=n", OptionParser::DecimalInteger, "Process using a thread pool of size n") do |thread_count|
    options[:thread_count] = thread_count
  end

  opt.on("-v", "--verbose", "show all urls processed") do
    options[:verbose] = true
  end

  opt.on("-h", "--help", "help (show this)") do
    puts opt_parser
    exit
  end
end
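# Example invocation (4 worker threads, printing every url as it is processed):
#   ./simple_crawler.rb https://gocardless.com -t 4 -v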
 
# Parse command line options
opt_parser.parse!

# Require a domain argument
if ARGV.count < 1
  puts opt_parser
  exit
end
 
# Crawl domain URL
domain_url = ARGV[0]
c = SimpleCrawler.new(domain_url, options)
c.crawl
 
# Print pages hash
ap c.get_pages
