Created
April 24, 2018 05:42
-
-
Save hrs/d246006a7be5c7175b24ec26c411892f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "nokogiri" | |
require "open-uri" | |
require "optparse" | |
require "set" | |
# Default maximum link-following depth (overridable via --depth).
MAX_DEPTH = 2
# Wraps one fetched HTML document and extracts its outgoing links.
#
# Absolute http(s) hrefs are returned as-is; root-relative hrefs
# ("/path") and protocol-relative hrefs ("//host/path") are
# canonicalized against this page's own URL.
class Page
  # body: the HTML (anything Nokogiri::HTML accepts); url: the page's
  # absolute URL, used to resolve relative links.
  def initialize(body:, url:)
    @body = body
    @url = url
  end

  # All outgoing links, as absolute URLs.
  def links
    http_urls + canonicalized_relative_urls + canonicalized_protocol_relative_urls
  end

  private

  attr_reader :body, :url

  # Hrefs that are already absolute http or https URLs.
  def http_urls
    anchor_hrefs.select { |href| href =~ %r{\Ahttps?://} }
  end

  # Root-relative hrefs resolved against this page's scheme and host.
  def canonicalized_relative_urls
    relative_urls.map { |path| "#{scheme}://#{host}#{path}" }
  end

  # Protocol-relative hrefs ("//host/path") resolved with this page's
  # scheme. The previous implementation treated these as root-relative,
  # producing broken URLs like "https://host//other-host/path".
  def canonicalized_protocol_relative_urls
    anchor_hrefs.
      select { |href| href.start_with?("//") }.
      map { |href| "#{scheme}:#{href}" }
  end

  # Hrefs starting with exactly one slash (root-relative paths);
  # protocol-relative hrefs are excluded and handled separately.
  def relative_urls
    anchor_hrefs.select { |href| href.start_with?("/") && !href.start_with?("//") }
  end

  # Every non-nil href attribute on an <a> element.
  def anchor_hrefs
    doc.css("a").map { |anchor| anchor["href"] }.compact
  end

  def host
    parsed_url.host
  end

  def scheme
    parsed_url.scheme
  end

  # Parse the page URL once and memoize the result (the original
  # re-parsed on every host/scheme call).
  def parsed_url
    @_parsed_url ||= URI.parse(url)
  end

  def doc
    @_doc ||= Nokogiri::HTML(body)
  end
end
# Breadth-first link scraper: starting from a set of seed URLs, follows
# links up to a maximum depth, optionally keeping only URLs that contain
# a filter substring.
class Scraper
  # initial_urls: the seed URLs to crawl from.
  # filter: substring a URL must contain to be kept ("" keeps all).
  def initialize(initial_urls, filter: "")
    @filter = filter
    @initial_urls = initial_urls
  end

  # Returns every link discovered within max_depth hops of the seeds.
  def links(max_depth: MAX_DEPTH)
    horizon = Set.new(initial_urls)
    visited_links = Set.new

    max_depth.times do
      new_links = Set.new(horizon.flat_map { |url| links_for(url) })
      # Only expand links we haven't already seen on the next pass.
      horizon = new_links - visited_links
      visited_links += new_links
    end

    visited_links.to_a
  end

  private

  attr_reader :filter, :initial_urls

  # Fetches one page and returns its (filtered) outgoing links. Any
  # fetch/parse failure is treated as "no links" so a single bad URL
  # doesn't abort the whole crawl.
  def links_for(url)
    page = Page.new(
      # URI.open instead of Kernel#open: the bare open() form is
      # deprecated for URLs (removed in Ruby 3.0) and can execute a
      # shell command when handed a string beginning with "|".
      body: URI.open(url),
      url: url,
    )
    page.links.select { |link| link.include?(filter) }
  rescue StandardError
    # open-uri raises OpenURI::HTTPError / SocketError / Errno::* —
    # none of which are RuntimeErrors, so the original rescue never
    # caught real fetch failures.
    []
  end
end
filter = "" | |
max_depth = MAX_DEPTH | |
OptionParser.new do |opts| | |
opts.banner = "Usage: #{$0}" | |
opts.on("-h", "--help", "Prints this help information") do | |
puts opts | |
exit | |
end | |
opts.on( | |
"-f", "--filter FILTER", | |
"Only follow links matching FILTER", | |
) do |defined_filter| | |
filter = defined_filter | |
end | |
opts.on( | |
"-d", "--depth DEPTH", | |
"Follow links to a maximum depth of DEPTH (defaults to #{ MAX_DEPTH })", | |
) do |depth| | |
max_depth = depth.to_i | |
end | |
end.parse! | |
initial_urls = ARGF.readlines.map(&:strip) | |
scraper = Scraper.new(initial_urls, filter: filter) | |
puts scraper.links(max_depth: max_depth) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
First, save the script locally. To run it, pass a file of seed URLs (one per line) as an argument, or pipe them in on stdin. Note that --filter just checks URLs for textual inclusion (it's not only checking domains, and it's not a regex).