# takuma-saito/traversable.rb

## traversable.rb
require 'uri'
require 'open-uri'
require 'nokogiri'

# Breadth-first, same-host link traversal.
#
# The including class must provide +extract_urls(content)+, returning the href
# strings found in a fetched page. +fetch+ and +save+ may be overridden.
module Traversable
  # A queued page: its absolute URI and its link distance from the root.
  Target = Struct.new(:url, :depth, keyword_init: true)

  # Walks the pages reachable from +root+ in breadth-first order, following
  # only links whose host equals the root's host.
  #
  # @param root [String] absolute URL to start from
  # @param max_depth [Integer] pages at this depth or deeper are not fetched
  # @return [nil]
  # @raise [URI::InvalidURIError] if +root+ itself cannot be parsed
  def traverse(root, max_depth = 2)
    root_url = URI.parse(root)
    host = root_url.host
    start = strip_fragment(root_url)
    # URLs already queued, keyed by their fragment-less string form, so a page
    # linked from several places is fetched only once.
    seen = { start.to_s => true }
    targets = [Target.new(url: start, depth: 0)]
    while (target = targets.shift)
      content = nil
      begin
        # The queue is ordered by depth, so nothing shallower remains.
        break if target.depth >= max_depth

        wait = rand
        sleep wait
        content = fetch(target.url)
        save target.url, content, wait
        extract_urls(content).each do |href|
          child = resolve_link(target.url, href)
          next unless child && child.host == host
          next if seen[child.to_s]

          seen[child.to_s] = true
          targets << Target.new(url: child, depth: target.depth + 1)
        end
      rescue URI::InvalidURIError, OpenURI::HTTPError, Net::OpenTimeout
        # Problem with this page only: skip it and keep crawling.
        next
      rescue SocketError, OpenSSL::SSL::SSLError, Errno::ECONNREFUSED
        # Connection-level failure: abandon the whole crawl.
        break
      ensure
        # Release the page body once this page has been processed.
        content.close if content.respond_to?(:close)
      end
    end
    puts "done: #{root_url}"
  end

  # Opens +link+ with the fragment removed. The argument itself is not modified.
  #
  # @param link [URI::Generic]
  # @return [IO] whatever URI.open returns for the URL
  def fetch(link)
    URI.open(strip_fragment(link))
  end

  # Reports one fetched page: URL, body size and the delay used before fetching.
  def save(url, content, wait)
    puts "%s: %s, %2.2f" % [url.to_s, content.size, wait]
  end

  private

  # Returns a copy of +uri+ without its fragment (nil, not '', so that no
  # trailing '#' is rendered); the receiver is left untouched.
  def strip_fragment(uri)
    uri.dup.tap { |copy| copy.fragment = nil }
  end

  # Turns an href into an absolute, fragment-less URI relative to +base+.
  # Returns nil for an href that cannot be parsed, so that a single malformed
  # link does not discard the other links of the page.
  def resolve_link(base, href)
    link = URI.parse(href)
    link = base.merge(link) if link.relative?
    strip_fragment(link)
  rescue URI::InvalidURIError
    nil
  end
end

# Crawler that starts traversing as soon as it is instantiated and reads the
# links of a page out of its <a> elements.
class Crawler
  include Traversable

  # Builds a crawler, which runs the traversal; arguments are those of #traverse.
  def self.run(*args)
    new(*args)
  end

  # Constructing the object performs the whole traversal.
  def initialize(*args)
    traverse(*args)
  end

  # Returns the stripped href of every <a> element whose href does not start
  # with "mailto"; anchors without an href are left out.
  def extract_urls(content)
    document = Nokogiri::HTML(content)
    anchors = document.xpath('//a[not(starts-with(@href, "mailto"))]')
    hrefs = anchors.map { |anchor| anchor[:href]&.strip }
    hrefs.compact
  end
end

# Sample pages. Frozen so the constant cannot be modified by accident.
URL = %w[
  https://docs.ruby-lang.org/ja/latest/method/URI/s/parse.html
  http://www.glivec.jp/glivecnavi/gist/about.html
  https://stackoverflow.com/questions/11267266/finding-e-mail-addresses-using-xpath
  https://www.crccheck.com/blog/xpath-to-find-email-address-links/
].freeze

# Crawl every domain listed in domains.txt (one per line), one thread per
# domain, and wait for all of them. Surrounding whitespace and carriage returns
# are stripped and blank lines skipped, so that no URL like "https:///" is built.
domains = File.readlines('domains.txt', chomp: true).map(&:strip).reject(&:empty?)
crawls = domains.map do |domain|
  Thread.new do
    Crawler.run("https://#{domain}/")
  end
end
crawls.each(&:join)
	require 'uri'
	require 'open-uri'
	require 'nokogiri'

	# Breadth-first, same-host link traversal.
	#
	# The including class must provide +extract_urls(content)+, returning the href
	# strings found in a fetched page. +fetch+ and +save+ may be overridden.
	module Traversable
	  # A queued page: its absolute URI and its link distance from the root.
	  # Guarded because this file also defines the constant further up.
	  Target = Struct.new(:url, :depth, keyword_init: true) unless const_defined?(:Target, false)

	  # Walks the pages reachable from +root+ in breadth-first order, following
	  # only links whose host equals the root's host.
	  #
	  # @param root [String] absolute URL to start from
	  # @param max_depth [Integer] pages at this depth or deeper are not fetched
	  # @return [nil]
	  # @raise [URI::InvalidURIError] if +root+ itself cannot be parsed
	  def traverse(root, max_depth = 2)
	    root_url = URI.parse(root)
	    host = root_url.host
	    start = strip_fragment(root_url)
	    # URLs already queued, keyed by their fragment-less string form, so a page
	    # linked from several places is fetched only once.
	    seen = { start.to_s => true }
	    targets = [Target.new(url: start, depth: 0)]
	    while (target = targets.shift)
	      content = nil
	      begin
	        # The queue is ordered by depth, so nothing shallower remains.
	        break if target.depth >= max_depth

	        wait = rand
	        sleep wait
	        content = fetch(target.url)
	        save target.url, content, wait
	        extract_urls(content).each do |href|
	          child = resolve_link(target.url, href)
	          next unless child && child.host == host
	          next if seen[child.to_s]

	          seen[child.to_s] = true
	          targets << Target.new(url: child, depth: target.depth + 1)
	        end
	      rescue URI::InvalidURIError, OpenURI::HTTPError, Net::OpenTimeout
	        # Problem with this page only: skip it and keep crawling.
	        next
	      rescue SocketError, OpenSSL::SSL::SSLError, Errno::ECONNREFUSED
	        # Connection-level failure: abandon the whole crawl.
	        break
	      ensure
	        # Release the page body once this page has been processed.
	        content.close if content.respond_to?(:close)
	      end
	    end
	    puts "done: #{root_url}"
	  end

	  # Opens +link+ with the fragment removed. The argument itself is not modified.
	  #
	  # @param link [URI::Generic]
	  # @return [IO] whatever URI.open returns for the URL
	  def fetch(link)
	    URI.open(strip_fragment(link))
	  end

	  # Reports one fetched page: URL, body size and the delay used before fetching.
	  def save(url, content, wait)
	    puts "%s: %s, %2.2f" % [url.to_s, content.size, wait]
	  end

	  private

	  # Returns a copy of +uri+ without its fragment (nil, not '', so that no
	  # trailing '#' is rendered); the receiver is left untouched.
	  def strip_fragment(uri)
	    uri.dup.tap { |copy| copy.fragment = nil }
	  end

	  # Turns an href into an absolute, fragment-less URI relative to +base+.
	  # Returns nil for an href that cannot be parsed, so that a single malformed
	  # link does not discard the other links of the page.
	  def resolve_link(base, href)
	    link = URI.parse(href)
	    link = base.merge(link) if link.relative?
	    strip_fragment(link)
	  rescue URI::InvalidURIError
	    nil
	  end
	end

	# Crawler that starts traversing as soon as it is instantiated and reads the
	# links of a page out of its <a> elements.
	class Crawler
	  include Traversable

	  # Builds a crawler, which runs the traversal; arguments are those of #traverse.
	  def self.run(*args)
	    new(*args)
	  end

	  # Constructing the object performs the whole traversal.
	  def initialize(*args)
	    traverse(*args)
	  end

	  # Returns the stripped href of every <a> element whose href does not start
	  # with "mailto"; anchors without an href are left out.
	  def extract_urls(content)
	    Nokogiri::HTML(content).xpath('//a[not(starts-with(@href, "mailto"))]')
	      .map { |node| node[:href]&.strip }.compact
	  end
	end

	# Sample pages. Frozen so the constant cannot be modified by accident, and
	# guarded because this file also assigns URL further up.
	URL = %w[
	  https://docs.ruby-lang.org/ja/latest/method/URI/s/parse.html
	  http://www.glivec.jp/glivecnavi/gist/about.html
	  https://stackoverflow.com/questions/11267266/finding-e-mail-addresses-using-xpath
	  https://www.crccheck.com/blog/xpath-to-find-email-address-links/
	].freeze unless defined?(URL)

	# Crawl every domain listed in domains.txt (one per line), one thread per
	# domain, and wait for all of them. Surrounding whitespace and carriage returns
	# are stripped and blank lines skipped, so that no URL like "https:///" is built.
	File.readlines('domains.txt', chomp: true).map(&:strip).reject(&:empty?).map do |domain|
	  Thread.new do
	    Crawler.run("https://#{domain}/")
	  end
	end.each(&:join)