Skip to content

Instantly share code, notes, and snippets.

@natarius
Last active December 13, 2015 17:38
Show Gist options
  • Save natarius/4949194 to your computer and use it in GitHub Desktop.
Save natarius/4949194 to your computer and use it in GitHub Desktop.
Crawl email addresses from a site, recursively following all links on that domain.
# Crawls a website and collects the e-mail addresses found in its
# +mailto:+ anchors, recursively following links that stay on the host of
# the start URL. Iframe documents found on visited pages are scanned too.
#
#   EmailCrawler.new.crawl("http://example.com") # => ["info@example.com"]
#
# Progress and non-fatal errors are reported with +puts+.
class EmailCrawler
  require 'rubygems'
  require 'cgi'
  require 'uri'
  require 'mechanize'
  require 'parallel'

  # Prefix that identifies an e-mail link.
  MAILTO_PREFIX = "mailto:".freeze

  # Schemes that may be fetched; the empty string stands for a relative link
  # (no scheme at all). Everything else (mailto:, javascript:, tel:, ...) is
  # never followed.
  FOLLOWABLE_SCHEMES = ["", "http", "https"].freeze

  attr_accessor :links, :crawl_host, :parsed_sites

  # Crawls +url+ and every reachable page on the same host.
  #
  # @param url [String] absolute URL of the page to start from
  # @return [Array<String>] unique, lower-cased e-mail addresses
  # @raise [StandardError] whatever Mechanize raises when the start page
  #   itself cannot be fetched; failures on later pages are only logged
  def crawl(url)
    @links = []
    @parsed_sites = []
    # Guards @links and @parsed_sites, which are touched from worker threads.
    @lock = Mutex.new
    @crawl_host = URI(url).host
    agent = Mechanize.new
    page = agent.get(url)
    begin
      crawl_site(page)
    rescue => e
      puts e
    end
    get_emails
  end

  private

  # Processes one fetched document: records it as visited, harvests its
  # e-mail links, then descends into its links and iframes.
  def crawl_site(page)
    return unless mark_visited(page.uri)
    # Downloads such as PDFs or images have no DOM to search.
    return unless page.is_a?(Mechanize::Page)

    puts "crawling site: #{page.uri}"
    crawl_emails_from(page)
    crawl_links_of(page)
    crawl_iframes_of(page)
  end

  # Atomically records +uri+ as visited.
  # Returns true if it was new, false if it had been seen before.
  def mark_visited(uri)
    @lock.synchronize do
      if @parsed_sites.include?(uri)
        false
      else
        @parsed_sites << uri
        true
      end
    end
  end

  # Follows the same-host links of +page+ using three worker threads.
  # NOTE(review): link.click reuses the agent that fetched +page+ from
  # several threads at once - confirm Mechanize tolerates that.
  def crawl_links_of(page)
    Parallel.each(page.links, :in_threads => 3) do |link|
      begin
        # follow? parses the href, which can raise on malformed input, so it
        # has to live inside the rescue to keep one bad link from aborting
        # the whole page.
        next unless follow?(page, link)
        crawl_site(link.click)
      rescue => e
        puts e
      end
    end
  end

  # True when +link+ is a plain anchor that leads to an http(s) document on
  # the crawl host. Relative links inherit the host of the page they are on.
  def follow?(page, link)
    return false unless link.instance_of?(Mechanize::Page::Link)
    uri = link.uri
    return false if uri.nil?
    return false unless FOLLOWABLE_SCHEMES.include?(uri.scheme.to_s.downcase)

    host = uri.host.to_s.empty? ? page.uri.host : uri.host
    host.to_s.casecmp(@crawl_host.to_s).zero?
  end

  # Fetches and crawls every iframe document of +page+. Each iframe is
  # loaded with its own agent; a failing iframe is logged and skipped.
  def crawl_iframes_of(page)
    page.iframes.each do |iframe|
      next unless iframe.instance_of?(Mechanize::Page::Frame)
      src = iframe.src.to_s.strip
      next if src.empty?
      begin
        # A fresh agent has no current page, so a relative src must be
        # resolved against the embedding page first.
        frame_url = page.uri.merge(src).to_s
        crawl_site(Mechanize.new.get(frame_url))
      rescue => e
        puts e
      end
    end
  end

  # Collects the address of every +mailto:+ anchor that has visible text.
  def crawl_emails_from(page)
    page.search("a").each do |anchor|
      next if anchor.content.to_s.strip.empty?
      href = anchor.attr("href").to_s.strip
      next unless href.downcase.start_with?(MAILTO_PREFIX)
      add_email(extract_address(href))
    end
  end

  # Returns the decoded address part of a +mailto:+ href, without the
  # scheme and without trailing "?subject=..." style parameters.
  def extract_address(href)
    address = href[MAILTO_PREFIX.length..-1].split("?", 2).first.to_s
    # CGI.unescape decodes "+" as a space, which would corrupt plus-addressed
    # mailboxes (user+tag@host); protect literal plus signs first.
    CGI.unescape(address.gsub("+", "%2B")).strip
  end

  # Stores a non-empty address (lower-cased) and logs the running total.
  def add_email(email_address)
    address = email_address.to_s.strip.downcase
    return if address.empty?

    total = @lock.synchronize do
      @links << address
      @links.size
    end
    puts "Added #{address}"
    puts "Crawled #{total} email addresses"
  end

  # Returns the collected addresses without duplicates.
  def get_emails
    @links.uniq
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment