Last active
December 13, 2015 17:38
-
-
Save natarius/4949194 to your computer and use it in GitHub Desktop.
Crawl email addresses from a site, recursively following all links on that domain.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Crawls a website and collects the e-mail addresses found in its
# +mailto:+ links.
#
# Starting from one URL, the crawler follows relative links and absolute
# http(s) links that point at the starting host, descends into iframes,
# and records every address it sees.
#
#   EmailCrawler.new.crawl("http://example.com") # => ["info@example.com", ...]
#
# Uses only plain Ruby string/nil checks, so it runs without ActiveSupport.
class EmailCrawler
  require 'rubygems'
  require 'cgi'
  require 'uri'
  require 'mechanize'
  require 'parallel'

  # links        - every address collected so far (lower-cased, may repeat)
  # crawl_host   - host of the URL passed to #crawl
  # parsed_sites - URIs of the pages that were already visited
  attr_accessor :links, :crawl_host, :parsed_sites

  # Crawls +url+ and everything reachable from it on the same host.
  #
  # url - String URL of the start page.
  #
  # Returns an Array of unique, lower-cased e-mail address Strings.
  # Errors raised while fetching the start page propagate to the caller;
  # errors raised while crawling are printed and otherwise ignored.
  def crawl(url)
    @links = []
    @parsed_sites = []
    @crawl_host = URI(url).host
    # Guards @links and @parsed_sites, which are touched from the worker
    # threads started in #crawl_site.
    @lock = Mutex.new

    agent = Mechanize.new
    page = agent.get(url)
    begin
      crawl_site(page)
    rescue => e
      puts e
    end
    get_emails
  end

  private

  # Visits +page+ once: harvests its mailto links, then recurses into its
  # followable links (three at a time) and its iframes.
  #
  # NOTE(review): link.click goes through the Mechanize agent that fetched
  # +page+, so that agent is used from several threads at once - confirm
  # this is acceptable for the Mechanize version in use.
  def crawl_site(page)
    return unless mark_visited(page.uri)
    # Non-HTML responses (PDFs, images, ...) have neither links nor a DOM.
    return unless page.is_a?(Mechanize::Page)

    puts "crawling site: #{page.uri}"
    crawl_emails_from(page)

    Parallel.each(page.links, :in_threads => 3) do |link|
      begin
        # The filter lives inside the rescue because parsing a malformed
        # href can raise; that must not abort the remaining links.
        crawl_site(link.click) if followable?(link)
      rescue => e
        puts e
      end
    end

    page.iframes.each do |iframe|
      next if iframe.src.to_s.strip.empty?

      begin
        # Clicking the frame resolves a relative src against the page.
        crawl_site(iframe.click)
      rescue => e
        puts e
      end
    end
  end

  # Records +uri+ as visited.
  #
  # Returns true when the URI was new and false when it had been seen
  # before. Check and insert happen under the lock so that two threads
  # cannot both claim the same page.
  def mark_visited(uri)
    @lock.synchronize do
      if @parsed_sites.include?(uri)
        false
      else
        @parsed_sites << uri
        true
      end
    end
  end

  # Decides whether +link+ should be crawled: it needs an href, a web (or
  # absent) scheme, and either no host (relative link) or the crawled host.
  def followable?(link)
    uri = link.uri
    return false if uri.nil?
    # Rejects mailto:, javascript:, tel: and other non-web schemes.
    return false unless ['', 'http', 'https'].include?(uri.scheme.to_s.downcase)

    host = uri.host.to_s
    host.empty? || host.casecmp(@crawl_host.to_s).zero?
  end

  # Adds the address(es) of every +mailto:+ anchor on +page+ that has
  # visible text.
  def crawl_emails_from(page)
    page.search("a").each do |anchor|
      href = anchor["href"].to_s.strip
      next unless href.downcase.start_with?("mailto:")
      next if anchor.content.to_s.strip.empty?

      addresses_from_mailto(href).each { |address| add_email(address) }
    end
  end

  # Extracts the recipient addresses from a mailto href.
  #
  # href - String such as "mailto:a%40x.org,b@y.org?subject=Hi".
  #
  # Returns an Array of decoded address Strings; entries may be empty.
  def addresses_from_mailto(href)
    # Drop the scheme and any "?subject=..." header part.
    recipients = href.sub(/\Amailto:/i, "").split("?", 2).first.to_s
    recipients.split(",").map do |recipient|
      # CGI.unescape decodes "+" as a space, which would corrupt
      # plus-addressed mailboxes, so literal pluses are protected first.
      CGI.unescape(recipient.gsub("+", "%2B")).strip
    end
  end

  # Stores one address (lower-cased) and reports progress; blank input is
  # ignored.
  def add_email(email_address)
    address = email_address.to_s.strip.downcase
    return if address.empty?

    total = @lock.synchronize do
      @links << address
      @links.size
    end
    puts "Added #{address}"
    puts "Crawled #{total} email addresses"
  end

  # Returns the collected addresses without duplicates.
  def get_emails
    @links.uniq
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment