Created
August 22, 2012 09:17
-
-
Save bnagy/3423977 to your computer and use it in GitHub Desktop.
threadsafe search engine automation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'capybara' | |
require 'capybara/dsl' | |
require 'capybara/poltergeist' | |
# Global Capybara settings used by every searcher in this file:
# run_server is switched off and :poltergeist becomes the default driver.
Capybara.run_server = false
Capybara.default_driver = :poltergeist
module Searchers | |
# Signals that a results page is showing a "captcha" field instead of results.
CaptchaError = Class.new(StandardError)
# Base class for scripted searches against one search engine.
#
# Each instance owns its own Capybara::Session (exposed as +page+, which the
# Capybara DSL methods used below operate on). Subclasses declare the engine's
# start URL with the class-level +href+ macro and override +select_results+;
# +setup+, +extract_url+ and +next_page+ may be overridden as needed.
class Search
  include Capybara::DSL

  # Defaults for #search options:
  #   :lim   - maximum number of URLs to yield/collect
  #   :delay - upper bound, in seconds, of the random pause between pages
  DEFAULTS = { lim: 100, delay: 3 }.freeze

  # Number of consecutive JavascriptErrors (with no results page processed in
  # between) that #search tolerates before giving up.
  MAX_JS_ERRORS = 5

  class << self
    # Class-level macro/accessor for the engine's start URL.
    #
    # With an argument, stores it on the receiving class. Always returns the
    # stored value, falling back to the nearest ancestor that declared one so
    # that subclasses of a concrete searcher inherit its URL.
    def href(href = nil)
      @href = href if href
      @href || (superclass.href if superclass.respond_to?(:href))
    end
  end

  attr_reader :base_href, :page

  # Opens a dedicated session and visits the start page.
  #
  # href - String or URI; defaults to the URL declared with the +href+ macro.
  #
  # Raises ArgumentError when no href is available or when its scheme is not
  # http/https.
  def initialize(href = self.class.href)
    raise ArgumentError, "no href given and #{self.class} declares none" if href.nil?

    @base_href = URI(href)
    # Capybara requires all absolute URLs to start with http.
    unless @base_href.scheme =~ /\Ahttps?\z/i
      raise ArgumentError, "base_href must be of http(s) scheme"
    end
    # Overridden, to make sure we have one session per agent.
    @page = Capybara::Session.new(Capybara.default_driver)
    # URI#path is "" (never nil) for a bare host, so test for emptiness.
    start_path = base_href.path
    visit(start_path.empty? ? '/' : start_path)
  end

  # Visits +url+ on the engine's host.
  #
  # Only the path component of +url+ is used: it replaces the path of
  # +base_href+ (mutating it), and the resulting absolute URL is visited.
  def visit(url)
    base_href.path = URI(url).path
    super(base_href.to_s)
  end

  # True when the current page has a field located by "captcha".
  def captcha?
    page.has_field? "captcha"
  end

  # Submits +query+: fills the "q" field and clicks "Search".
  def setup(query)
    fill_in "q", :with => query
    click_on "Search"
  end

  # Hook for subclasses: return the result link elements on the current page.
  # The base implementation returns nil, which makes #search warn and stop.
  def select_results; end

  # Maps one result element to its URL (its href attribute by default).
  def extract_url(result)
    result[:href]
  end

  # Advances to the next results page by clicking 'Next'.
  def next_page
    click_on 'Next'
  end

  # Runs +query+ and walks the result pages.
  #
  # query - search string passed to #setup.
  # opts  - Hash overriding DEFAULTS (:lim, :delay).
  #
  # Yields each extracted URL when a block is given; otherwise collects them.
  # At most opts[:lim] URLs are produced.
  #
  # Paging stops when a page has no results, when the limit is reached, or
  # when an error occurs. A CaptchaError raised here is caught by the generic
  # rescue below, so it results in a warning, not a propagated exception.
  # JavascriptErrors are ignored and the page is re-examined, but only
  # MAX_JS_ERRORS times in a row, so a persistently failing page cannot spin
  # this loop forever.
  #
  # Returns the Array of URLs, or nil when a block was given.
  def search(query, opts = {})
    setup(query)
    opts = DEFAULTS.merge(opts)
    limit = opts[:lim]
    urls = []
    url_count = 0
    js_errors = 0

    loop do
      begin
        raise CaptchaError if captcha?

        results = select_results
        break if results.empty?

        results.each do |result|
          # Stop mid-page so the limit is honoured exactly.
          break if url_count >= limit

          url_count += 1
          if block_given?
            yield extract_url(result)
          else
            urls << extract_url(result)
          end
        end
        # A page was processed, so earlier JS errors were transient.
        js_errors = 0
        break if url_count >= limit

        # Random pause in [0, delay) seconds between page loads.
        sleep(rand * opts[:delay])
        next_page
      rescue Capybara::Poltergeist::JavascriptError
        js_errors += 1
        if js_errors > MAX_JS_ERRORS
          warn "#{self}:#{__method__}: giving up after #{js_errors} consecutive JavaScript errors"
          break
        end
      rescue => e
        warn "#{self}:#{__method__}: #{e}"
        break
      end
    end

    block_given? ? nil : urls
  end
end
# Searcher for Google: results are the anchors matched by "h3.r a".
class Google < Search
  href 'https://www.google.com'

  # Result link elements on the current page.
  def select_results
    all "h3.r a"
  end

  # Extracts the destination URL from a result anchor.
  #
  # google returns redirect links to itself ("/url?q=<target>&sa=..."), with
  # the target percent-encoded inside the q parameter. The target is pulled
  # out and decoded; an href that is not such a redirect is returned
  # unchanged instead of nil.
  def extract_url(result)
    link = result[:href]
    target = link[%r{/url\?q=(.*?)&sa}, 1]
    return link unless target

    begin
      URI.decode_www_form_component(target)
    rescue ArgumentError
      # Malformed %-escape: hand back the raw capture rather than fail.
      target
    end
  end
end
# Searcher for Bing: every anchor inside an h3 counts as a result link.
class Bing < Search
  href 'http://www.bing.com'

  # Result link elements found on the session's current page.
  def select_results
    page.all('h3 a')
  end
end
# Searcher for DuckDuckGo, started from its /html path.
class DuckDuckGo < Search
  href 'http://www.duckduckgo.com/html'

  # Result link elements found on the session's current page.
  def select_results
    page.all('div.results_links div.links_deep a')
  end
end
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment