Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
traversable.rb
require 'net/http'
require 'open-uri'
require 'openssl'
require 'uri'

require 'nokogiri'
# Mixin implementing a breadth-first crawl restricted to the host of the
# starting URL.
#
# The including class must provide +extract_urls(content)+, returning the link
# strings found in a fetched page. +fetch+ and +save+ may be overridden to
# change how pages are retrieved and recorded.
module Traversable
  # A URL queued for fetching, with its link distance from the root URL.
  Target = Struct.new(:url, :depth, keyword_init: true)

  # Crawls from +root+, following links that stay on the root's host.
  #
  # Each distinct URL (ignoring its fragment) is fetched at most once.
  #
  # @param root [String] absolute URL to start from
  # @param max_depth [Integer] pages at this link distance or deeper are not fetched
  # @return [nil] the return value of the final +puts+
  # @raise [URI::InvalidURIError] if +root+ itself cannot be parsed
  def traverse(root, max_depth = 2)
    root_url = without_fragment(URI.parse(root))
    host = root_url.host
    seen = { root_url.to_s => true }
    targets = [Target.new(url: root_url, depth: 0)]

    while (target = targets.shift)
      # The queue is FIFO, so depths never decrease: the first target that is
      # too deep means everything still queued is too deep as well.
      break if target.depth >= max_depth

      content = nil
      begin
        # Random pause of up to one second between requests.
        wait = rand
        sleep wait
        content = fetch(target.url)
        save target.url, content, wait
        targets.concat(child_targets(content, target, host, seen))
      rescue URI::InvalidURIError, OpenURI::HTTPError, Net::OpenTimeout, Net::ReadTimeout
        # A problem with this one page: skip it and continue with the queue.
        next
      rescue SocketError, OpenSSL::SSL::SSLError, Errno::ECONNREFUSED
        # Connection-level failure: give up on this crawl.
        break
      ensure
        # Responses that are IO-like are released once the page has been
        # processed; +save+ must not keep them for later reading.
        content.close if content.respond_to?(:close)
      end
    end

    puts "done: #{root_url}"
  end

  # Retrieves a page. The caller's URI object is left unmodified.
  #
  # @param link [URI::Generic] URL to retrieve; any fragment is dropped first
  # @return [Object] whatever +URI.open+ returns for the URL
  def fetch(link)
    URI.open(without_fragment(link))
  end

  # Records a fetched page by printing its URL, content size and the delay
  # that preceded the request.
  #
  # @param url [URI::Generic] URL that was fetched
  # @param content [#size] the fetched page
  # @param wait [Float] seconds slept before the request
  def save(url, content, wait)
    puts format('%s: %s, %2.2f', url.to_s, content.size, wait)
  end

  private

  # Builds targets for the not-yet-seen, same-host links found in +content+,
  # marking each of them as seen.
  def child_targets(content, parent, host, seen)
    extract_urls(content).each_with_object([]) do |href, found|
      child = resolve(parent.url, href)
      next if child.nil? || child.host != host

      key = child.to_s
      next if seen[key]

      seen[key] = true
      found << Target.new(url: child, depth: parent.depth + 1)
    end
  end

  # Turns +href+ into a fragment-free URI, resolving relative links against
  # +base+. Returns nil for a link that cannot be parsed or resolved, so one
  # bad link does not discard the rest of the page.
  def resolve(base, href)
    link = URI.parse(href)
    link = base.merge(link) if link.relative?
    without_fragment(link)
  rescue URI::InvalidURIError, URI::BadURIError
    nil
  end

  # Returns +url+ without its fragment, copying instead of mutating. The
  # fragment is set to nil rather than '' because an empty fragment still
  # renders a trailing '#'.
  def without_fragment(url)
    return url if url.fragment.nil?

    url.dup.tap { |copy| copy.fragment = nil }
  end
end
# Crawler that uses Traversable for the traversal and Nokogiri to pull the
# links out of each fetched page.
class Crawler
  include Traversable

  class << self
    # Builds a crawler; construction itself performs the whole crawl.
    #
    # @param args [Array] forwarded unchanged to +traverse+
    # @return [Crawler] the instance, once its traversal has finished
    def run(*args)
      new(*args)
    end
  end

  # Starts traversing immediately with the given arguments.
  def initialize(*args)
    traverse(*args)
  end

  # Collects the href of every anchor whose href does not start with
  # "mailto". Anchors without an href are dropped; surrounding whitespace is
  # removed from the rest.
  #
  # @param content the page, in any form Nokogiri::HTML accepts
  # @return [Array<String>] the link targets in document order
  def extract_urls(content)
    anchors = Nokogiri::HTML(content).xpath('//a[not(starts-with(@href, "mailto"))]')
    hrefs = anchors.map { |anchor| anchor[:href] }
    hrefs.compact.map(&:strip)
  end
end
# Sample page URLs. Nothing in the visible part of this script reads the
# constant; it is frozen so it cannot be modified by accident.
URL = [
  'https://docs.ruby-lang.org/ja/latest/method/URI/s/parse.html',
  'http://www.glivec.jp/glivecnavi/gist/about.html',
  'https://stackoverflow.com/questions/11267266/finding-e-mail-addresses-using-xpath',
  'https://www.crccheck.com/blog/xpath-to-find-email-address-links/'
].freeze

# Read one domain per line from domains.txt. Surrounding whitespace (including
# a CR from CRLF line endings) is stripped and blank lines are skipped, so they
# are never turned into unparseable "https://…/" URLs.
domains = File.readlines('domains.txt').map(&:strip).reject(&:empty?)

# Crawl every domain in its own thread, then wait for all of them to finish.
crawls = domains.map do |domain|
  Thread.new { Crawler.run("https://#{domain}/") }
end
crawls.each(&:join)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.