Created
April 24, 2018 05:42
-
-
Save hrs/d246006a7be5c7175b24ec26c411892f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "nokogiri" | |
require "open-uri" | |
require "optparse" | |
require "set" | |
# Default maximum link-following depth (overridable via --depth).
MAX_DEPTH = 2
# Wraps one fetched HTML document and extracts its outgoing links.
#
# Absolute http(s) hrefs are returned as-is; root-relative hrefs
# ("/path") and protocol-relative hrefs ("//host/path") are
# canonicalized against this page's own URL.
class Page
  # body: the HTML (anything Nokogiri::HTML accepts); url: the page's
  # absolute URL, used to resolve relative links.
  def initialize(body:, url:)
    @body = body
    @url = url
  end

  # All outgoing links, as absolute URLs.
  def links
    http_urls + canonicalized_relative_urls + canonicalized_protocol_relative_urls
  end

  private

  attr_reader :body, :url

  # Hrefs that are already absolute http or https URLs.
  def http_urls
    anchor_hrefs.select { |href| href =~ %r{\Ahttps?://} }
  end

  # Root-relative hrefs resolved against this page's scheme and host.
  def canonicalized_relative_urls
    relative_urls.map { |path| "#{scheme}://#{host}#{path}" }
  end

  # Protocol-relative hrefs ("//host/path") resolved with this page's
  # scheme. The previous implementation treated these as root-relative,
  # producing broken URLs like "https://host//other-host/path".
  def canonicalized_protocol_relative_urls
    anchor_hrefs.
      select { |href| href.start_with?("//") }.
      map { |href| "#{scheme}:#{href}" }
  end

  # Hrefs starting with exactly one slash (root-relative paths);
  # protocol-relative hrefs are excluded and handled separately.
  def relative_urls
    anchor_hrefs.select { |href| href.start_with?("/") && !href.start_with?("//") }
  end

  # Every non-nil href attribute on an <a> element.
  def anchor_hrefs
    doc.css("a").map { |anchor| anchor["href"] }.compact
  end

  def host
    parsed_url.host
  end

  def scheme
    parsed_url.scheme
  end

  # Parse the page URL once and memoize the result (the original
  # re-parsed on every host/scheme call).
  def parsed_url
    @_parsed_url ||= URI.parse(url)
  end

  def doc
    @_doc ||= Nokogiri::HTML(body)
  end
end
# Breadth-first link scraper: starting from a set of seed URLs, follows
# links up to a maximum depth, optionally keeping only URLs that contain
# a filter substring.
class Scraper
  # initial_urls: the seed URLs to crawl from.
  # filter: substring a URL must contain to be kept ("" keeps all).
  def initialize(initial_urls, filter: "")
    @filter = filter
    @initial_urls = initial_urls
  end

  # Returns every link discovered within max_depth hops of the seeds.
  def links(max_depth: MAX_DEPTH)
    horizon = Set.new(initial_urls)
    visited_links = Set.new

    max_depth.times do
      new_links = Set.new(horizon.flat_map { |url| links_for(url) })
      # Only expand links we haven't already seen on the next pass.
      horizon = new_links - visited_links
      visited_links += new_links
    end

    visited_links.to_a
  end

  private

  attr_reader :filter, :initial_urls

  # Fetches one page and returns its (filtered) outgoing links. Any
  # fetch/parse failure is treated as "no links" so a single bad URL
  # doesn't abort the whole crawl.
  def links_for(url)
    page = Page.new(
      # URI.open instead of Kernel#open: the bare open() form is
      # deprecated for URLs (removed in Ruby 3.0) and can execute a
      # shell command when handed a string beginning with "|".
      body: URI.open(url),
      url: url,
    )
    page.links.select { |link| link.include?(filter) }
  rescue StandardError
    # open-uri raises OpenURI::HTTPError / SocketError / Errno::* —
    # none of which are RuntimeErrors, so the original rescue never
    # caught real fetch failures.
    []
  end
end
filter = "" | |
max_depth = MAX_DEPTH | |
OptionParser.new do |opts| | |
opts.banner = "Usage: #{$0}" | |
opts.on("-h", "--help", "Prints this help information") do | |
puts opts | |
exit | |
end | |
opts.on( | |
"-f", "--filter FILTER", | |
"Only follow links matching FILTER", | |
) do |defined_filter| | |
filter = defined_filter | |
end | |
opts.on( | |
"-d", "--depth DEPTH", | |
"Follow links to a maximum depth of DEPTH (defaults to #{ MAX_DEPTH })", | |
) do |depth| | |
max_depth = depth.to_i | |
end | |
end.parse! | |
initial_urls = ARGF.readlines.map(&:strip) | |
scraper = Scraper.new(initial_urls, filter: filter) | |
puts scraper.links(max_depth: max_depth) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
First, save the script locally. To run it, pass a file of seed URLs (one per line) as an argument, or pipe them in on stdin. Note that --filter just checks URLs for textual inclusion (it's not only checking domains, and it's not a regex).