Skip to content

Instantly share code, notes, and snippets.

@bnagy
Created August 22, 2012 09:17
Show Gist options
  • Save bnagy/3423977 to your computer and use it in GitHub Desktop.
Save bnagy/3423977 to your computer and use it in GitHub Desktop.
Thread-safe search engine automation
require 'capybara'
require 'capybara/dsl'
require 'capybara/poltergeist'
# Global Capybara settings shared by every searcher in this file.
Capybara.configure do |capybara|
  # The searchers only visit remote sites, so Capybara's own app server
  # is switched off.
  capybara.run_server = false
  # Sessions are created with this driver unless told otherwise.
  capybara.default_driver = :poltergeist
end
module Searchers
# Signals that the current page contains a captcha field
# (raised from Search#search when Search#captcha? is true).
class CaptchaError < StandardError
end
# Base class for one scripted search-engine session.
#
# Each instance owns its own Capybara session (see #page), so separate
# instances do not share browser state. Subclasses declare their start URL
# with the class-level +href+ macro and override #select_results (and, where
# needed, #extract_url, #setup, #next_page, #captcha?).
class Search
  include Capybara::DSL

  # Default options for #search:
  #   lim   - maximum number of URLs to collect.
  #   delay - upper bound, in seconds, of the random pause between pages.
  DEFAULTS = { lim: 100, delay: 3 }.freeze

  class << self
    # Class-level start URL. With an argument, stores it on the receiving
    # class; always returns the stored value. The value lives in a class
    # instance variable, so every subclass keeps its own.
    def href(href = nil)
      @href = href if href
      @href
    end
  end

  attr_reader :base_href, :page

  # href - start URL of the engine; defaults to the URL declared with the
  #        class-level +href+ macro.
  #
  # Opens a fresh session and loads the start page.
  # Raises ArgumentError unless the URL scheme is http or https.
  def initialize(href = self.class.href)
    @base_href = URI(href)
    # Capybara requires all absolute URLs to start with http.
    unless @base_href.scheme =~ /\Ahttps?\z/i
      raise ArgumentError, "base_href must be of http(s) scheme"
    end
    # Overridden, to make sure we have one session per agent.
    @page = Capybara::Session.new(Capybara.default_driver)
    # A URL without a path parses to an empty string (not nil), so the
    # root path has to be substituted explicitly.
    start_path = base_href.path
    start_path = '/' if start_path.empty?
    visit start_path
  end

  # Loads +url+'s path on the base host. Only the path component of +url+
  # is used; it is written into #base_href, which therefore always holds
  # the most recently requested location.
  def visit(url)
    base_href.path = URI(url).path
    super(base_href.to_s)
  end

  # True when the current page has a form field located by "captcha".
  def captcha?
    page.has_field? "captcha"
  end

  # Submits +query+: fills the "q" field and clicks "Search".
  def setup(query)
    fill_in "q", :with => query
    click_on "Search"
  end

  # Hook: subclasses return the collection of result nodes found on the
  # current page. The base implementation returns nil.
  def select_results; end

  # Maps one result node to its URL; by default the node's href attribute.
  def extract_url(result)
    result[:href]
  end

  # Advances to the following results page by clicking "Next".
  def next_page
    click_on 'Next'
  end

  # Runs +query+ and walks the result pages.
  #
  # query - the search terms handed to #setup.
  # opts  - overrides for DEFAULTS (:lim, :delay).
  #
  # With a block, each extracted URL is yielded and nil is returned;
  # without one, the URLs are returned as an Array. At most opts[:lim]
  # URLs are produced.
  #
  # Paging stops when a page has no results, when the limit is reached, or
  # when any StandardError other than a Poltergeist JavascriptError is
  # raised (including CaptchaError); such an error is reported with +warn+
  # and whatever was collected so far is kept.
  def search(query, opts = {})
    setup query
    opts = DEFAULTS.merge opts
    urls = []
    url_count = 0
    loop do
      begin
        raise CaptchaError if captcha?
        results = select_results
        break if results.empty?
        results.each do |result|
          # Stop mid-page so the limit is a hard cap, not a per-page check.
          break if url_count >= opts[:lim]
          url_count += 1
          url = extract_url(result)
          if block_given?
            yield url
          else
            urls << url
          end
        end
        break if url_count >= opts[:lim]
        # Kernel#rand truncates a Float bound to an Integer, so scale a
        # unit-interval float instead to get a sub-second random pause
        # in [0, delay).
        sleep(rand * opts[:delay])
        next_page
      rescue Capybara::Poltergeist::JavascriptError
        # ignore: the loop simply carries on with the next iteration
      rescue StandardError => e
        warn "#{self}:#{__method__}: #{e}"
        break
      end
    end
    return urls unless block_given?
  end
end
# Google front end for Search.
class Google < Search
  href 'https://www.google.com'

  # Result anchors on the current page: links inside h3 elements of class "r".
  def select_results
    all "h3.r a"
  end

  # Returns the destination URL for a result anchor.
  #
  # google returns redirect links to itself (/url?q=<target>&sa...), so the
  # target is cut out of the q parameter. An href that is not in that
  # redirect form is returned untouched instead of collapsing to nil, and an
  # anchor without an href yields nil instead of raising (an exception here
  # would be rescued by Search#search and end the whole search).
  def extract_url(result)
    link = result[:href]
    return link unless link
    link[%r{/url\?q=(.*?)&sa}, 1] || link
  end
end
# Bing front end for Search.
class Bing < Search
  href 'http://www.bing.com'

  # Result anchors on the current page: every link nested in an h3 element.
  def select_results
    page.all('h3 a')
  end
end
# DuckDuckGo front end for Search, pointed at the /html start page.
class DuckDuckGo < Search
  href 'http://www.duckduckgo.com/html'

  # Result anchors on the current page, located by the site's
  # results_links / links_deep container classes.
  def select_results
    page.all('div.results_links div.links_deep a')
  end
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment