Skip to content

Instantly share code, notes, and snippets.

@afcapel
Last active April 8, 2023 04:39
Show Gist options
  • Save afcapel/758ad314ca97067de43f67a35fc70aa0 to your computer and use it in GitHub Desktop.
Save afcapel/758ad314ca97067de43f67a35fc70aa0 to your computer and use it in GitHub Desktop.
require "open-uri"
module Imports
# Breadth-first crawler that starts at one URL and follows same-host HTTP(S)
# links up to a maximum depth and page count.
#
# Every page that gets fetched is handed to the block given to #crawl.
# `follow_patterns` (when non-empty) whitelists link paths and
# `ignore_patterns` blacklists them.
class PageCrawler
  attr_reader :current_page, :pages, :crawled
  attr_accessor :follow_patterns, :ignore_patterns

  # @param start_url [String] absolute URL of the first page to fetch
  # @param max_depth [Integer] links found on pages at this depth are not followed
  # @param max_pages [Integer] upper bound on the number of queued pages
  def initialize(start_url, max_depth: 3, max_pages: 1000)
    @max_depth = max_depth
    @max_pages = max_pages
    @start_url = URI.parse(start_url)
    @follow_patterns = []
    @ignore_patterns = [/mailto/]
    @crawled = 0
    @pages = [CrawledPage.new(start_url, 0)]
  end

  # Crawls every pending page, queueing newly discovered links as it goes.
  # The block is forwarded to CrawledPage#crawl for each page.
  def crawl(&block)
    while (@current_page = @pages.find(&:pending?))
      @current_page.crawl(&block)
      @crawled += 1
      add_next_pages
    end
  end

  private

  # Queues the followable links of the current page, one level deeper.
  def add_next_pages
    return if @current_page.depth >= @max_depth || @pages.size >= @max_pages

    next_depth = @current_page.depth + 1
    links = @current_page.links.map { |link| normalize(link) }.compact
    followable = links.select { |link| should_follow?(link) }
    @pages += followable.map { |link| CrawledPage.new(link, next_depth) }
    # Compare by string form: the start page holds a String URL while
    # discovered pages hold URI objects, and the two never compare equal.
    @pages = @pages.uniq { |page| page.url.to_s }.take(@max_pages)
  end

  # Turns an href into an absolute URI, or nil when it cannot be parsed.
  def normalize(link)
    # href values may legitimately be padded with whitespace.
    uri = URI.parse(link.to_s.strip)
    # Relative references are relative to the page they appear on,
    # not to the crawl's start URL.
    uri = URI.parse(@current_page.url.to_s).merge(uri) if uri.relative?
    # A fragment only addresses a position inside a page; dropping it keeps
    # "/page#a" and "/page#b" from being queued as two different pages.
    uri.fragment = nil if uri.fragment
    uri
  rescue URI::Error => ex
    Rails.logger.error(ex.message)
    nil
  end

  # @param link [URI::Generic] an absolute URI produced by #normalize
  # @return [Boolean] whether the link should be queued for crawling
  def should_follow?(link)
    # Only http/https can be fetched; this also filters mailto:, ftp:, file: etc.
    return false unless link.is_a?(URI::HTTP)
    return false unless @start_url.host == link.host

    path = link.path
    return false unless follow_patterns.empty? || follow_patterns.any? { |pattern| pattern.match?(path) }

    ignore_patterns.none? { |pattern| pattern.match?(path) }
  end
end
# A single page in the crawl queue: its URL, its depth from the start page
# and, once fetched, its HTML.
class CrawledPage
  attr_reader :url, :html, :doc, :depth

  # @param url [String, URI] address of the page (converted with #to_s when fetched)
  # @param depth [Integer] number of links followed from the start page
  def initialize(url, depth)
    @url = url
    @depth = depth
    @pending = true
  end

  # Fetches the page and, on success, passes (url, html) to the block.
  #
  # A failed fetch is logged and skipped so one bad page does not abort the
  # whole crawl. Errors raised by the block itself are not rescued here.
  # The page stops being pending whatever the outcome.
  def crawl(&block)
    body = fetch
    return if body.nil?

    @html = body
    block&.call(@url, @html)
  ensure
    @pending = false
  end

  # @return [Array<String>] non-blank href values of every <a> in the page.
  #   The parsed document and the result are memoized on first call.
  def links
    @doc ||= Nokogiri::HTML(@html)
    @links ||= @doc.xpath("//a").map { |a| a["href"] }.reject(&:blank?)
  end

  # @return [Boolean] true until #crawl has been attempted
  def pending?
    @pending
  end

  private

  # Downloads the page body, or returns nil after logging why it could not.
  def fetch
    # Block form closes the underlying IO/Tempfile once it has been read.
    URI.open(@url.to_s, &:read)
  rescue StandardError => ex
    # HTTP errors, DNS failures, refused connections, timeouts, bad URLs...
    Rails.logger.error(ex.message)
    nil
  end
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment