# HTTP client wrapper around Mechanize with on-disk response caching,
# per-day response logging, optional proxying and random user agents.
require 'mechanize'
require 'logger'
require_relative 'random_agent'
require_relative 'path'
# Mechanize::Page subclasses Mechanize::File, Mechanize::Download is its own thing
# module MimeInfo
# def size
# header["content-length"].to_i
# end
# def mime_type
# header["content-type"] || "application/octet-stream"
# end
# def link_info
# "#{mime_type}#{size <= 0 ? "" : " (size: #{size} bytes)"}"
# end
# end
# Monkey-patch Mechanize::Parser (mixed into Mechanize::Page and, separately,
# Mechanize::Download) with response-inspection helpers: size/MIME info,
# status-code predicates, and a raw request+response dump used for logging.
module Mechanize::Parser
  # 3xx statuses treated as redirects. NOTE(review): 308 is not included --
  # confirm whether permanent redirects with method preservation matter here.
  REDIRECT_CODES = [301, 302, 303, 307]
  # include MimeInfo

  # cached_response: true when the body was served from HttpClient's disk cache.
  # saved_response:  path of the logged response file, set by HttpClient#log_response.
  attr_accessor :cached_response, :saved_response

  # Body size in bytes as reported by the server; 0 when the header is absent.
  def size
    header["content-length"].to_i
  end

  # Content type, defaulting to the generic binary type when the server sent none.
  def mime_type
    header["content-type"] || "application/octet-stream"
  end

  # One-line human-readable summary, e.g. 'text/html (size: 1234 bytes)'.
  def link_info
    "#{mime_type}#{size <= 0 ? "" : " (size: #{size} bytes)"}"
  end

  # HTTP status as an Integer (Mechanize stores @code as a String).
  def code
    @code.to_i
  end

  def url
    @uri.to_s
  end

  # Any non-200, non-redirect status counts as an error.
  # (Was `code != 200 and not redirect?` -- `&&`/`!` avoid the low-precedence
  # `and`/`not` trap without changing the result.)
  def error?
    code != 200 && !redirect?
  end

  def redirect?
    REDIRECT_CODES.include? code
  end

  # Redirect target from the Location header (nil when not a redirect).
  def redirect
    header["location"]
  end

  # "host:port" of the proxy used for this request, or nil for direct requests.
  def proxy
    "#{mech.proxy_addr}:#{mech.proxy_port}" if mech.proxy_addr && mech.proxy_port
  end

  # Reconstruct request line + response headers + body as one loggable string.
  def body_with_headers
    output = []
    getstr = "GET #{uri}"
    getstr += " (proxy: #{proxy})" if proxy
    getstr += " [cached]" if cached_response?
    output << getstr
    output << ""
    output << "HTTP/1.1 #{code}"
    canonical_each do |header, val|
      output << "#{header}: #{val}"
    end
    output << ""
    output << body
    output.join("\n")
  end

  # Predicate wrapper over the cached_response flag.
  def cached_response?
    cached_response
  end
  alias_method :cached?, :cached_response?
end
# HTTP client built on Mechanize. Adds: URL-keyed on-disk caching with TTL,
# logging of every response (headers + body) into per-day directories,
# optional proxy support with bounded 503 retries, random user agents,
# and cookie-jar persistence.
class HttpClient
  # Root directory for the read cache and the per-day response logs.
  CACHEDIR = Path["pagecache/"]

  # @param random_agent     [Boolean]      pick a random User-Agent via RandomAgent
  # @param retries          [Integer]      retry budget for 503s (only honored when a proxy is set)
  # @param cookie_file      [String, nil]  path used to load/save the cookie jar
  # @param verbose          [Boolean, nil] accepted but currently unused -- kept for interface compatibility
  # @param proxy            [Object, nil]  proxy record responding to host/port/username/password
  # @param relative_to      [String, nil]  base URL for resolving relative request URLs
  # @param cache_timeout    [Numeric, nil] default cache TTL in seconds
  # @param raise_exceptions [Boolean]      re-raise Mechanize errors instead of returning the error page
  # @param follow_redirects [Boolean]      let Mechanize follow 3xx automatically
  # @param logger           [Logger, nil]  defaults lazily to Rails.logger
  def initialize( random_agent: true,
                  retries: 0,
                  cookie_file: nil,
                  verbose: nil,
                  proxy: nil,
                  relative_to: nil,
                  cache_timeout: nil,
                  raise_exceptions: true,
                  follow_redirects: true,
                  logger: nil )
    @cookie_file      = cookie_file
    @proxy            = proxy
    @relative_to      = URI.parse(relative_to) if relative_to
    @cache_timeout    = cache_timeout
    @raise_exceptions = raise_exceptions
    @follow_redirects = follow_redirects
    @logger           = logger
    @random_agent     = random_agent
    @retries          = @proxy ? retries : 0 # only retry when using a proxy
    # NOTE(review): @lock is never used below -- presumably intended to guard
    # cookie-file writes in #get; confirm before removing.
    @lock = Mutex.new
  end

  def logger
    @logger ||= Rails.logger #Logger.new $stdout
  end

  # Lazily-built shared Mechanize instance.
  def client
    @client ||= new_client
  end

  # Build a fresh, fully configured Mechanize instance.
  def new_client
    Mechanize.new do |a|
      # TODO: Cleanup Proxy model (remove "useragent", rename "url" to "host", replace fields whose values are "NULL" or "" with nil)
      if @proxy
        a.set_proxy(@proxy.host, @proxy.port, @proxy.username, @proxy.password)
      end
      if @random_agent
        a.user_agent = RandomAgent.get
      else
        a.user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.4 (KHTML, like Gecko) Chrome/22.0.1229.94 Safari/537.4"
      end
      a.max_history = 0
      a.log = logger
      # WARNING: disables TLS certificate verification for every request.
      a.verify_mode = OpenSSL::SSL::VERIFY_NONE
      a.idle_timeout = 30
      a.read_timeout = 30
      if @follow_redirects
        a.redirect_ok = true
      else
        # Return the 3xx response itself instead of raising or following it.
        a.redirect_ok = false
        a.agent.allowed_error_codes += [301, 302, 303, 307]
      end
      # File.exists? was removed in Ruby 3.2 -- File.exist? is the supported name.
      a.cookie_jar.load(@cookie_file) if @cookie_file && File.exist?(@cookie_file)
    end
  end

  # Per-day log directory (CACHEDIR/YYYY-MM-DD), created on first use.
  def ymd_path
    path = CACHEDIR/ymd
    unless path.exists?
      path.mkdir_p
      puts "* Created #{path}"
    end
    path
  end

  # CACHEDIR/"today" symlink, repointed at the current day's directory.
  def today_path
    today = CACHEDIR/"today"
    if !today.exists?
      ymd_path.ln_s today
    elsif ymd_path.dirs.last != today.symlink_target.dirs.last
      today.rm
      ymd_path.ln_s today
      puts "* Symlinked #{today} to #{ymd_path}"
    end
    today
  end

  def ymd
    Time.now.strftime("%Y-%m-%d")
  end

  def timestamp
    Time.now.strftime("%H:%M:%S.%L")
  end

  # Directory holding TTL-based cached bodies, created on first use.
  def read_cache_path
    path = CACHEDIR/"readcache"
    path.mkdir_p unless path.exists?
    path
  end

  # Stable cache key for a URL.
  def url_hash(url)
    Digest::SHA1.hexdigest(url.to_s)
  end

  def cache_file_for(url)
    read_cache_path/"#{url_hash(url)}.html"
  end

  # Return a synthetic Mechanize::Page from the disk cache, or nil on a miss.
  # Expired entries (older than cache_timeout seconds) are deleted first.
  def get_cached_response_for(url, cache_timeout)
    url = @relative_to+url if @relative_to
    file = cache_file_for(url)
    p url: url, cache: file.to_s
    if file.exists? && cache_timeout
      expiry_date = file.mtime + cache_timeout
      file.rm if Time.now > expiry_date
    end
    if file.exists?
      Mechanize::Page.new( # uri=nil, response=nil, body=nil, code=nil, mech=nil)
        URI.parse(url),                   # uri
        {'content-type'=>"text/html"},    # response
        file.read,                        # body
        200,                              # code
        new_client                        # mechanize instance
      ).tap { |r| r.cached_response = true }
    else
      nil
    end
  end

  # Write the full request/response dump into today's log directory under a
  # unique timestamped name, and record that path on the response.
  def log_response(response)
    logpath = nil
    loop do
      logpath = today_path/"#{timestamp}.html"
      break unless logpath.exists?
    end
    logpath.write response.body_with_headers
    response.saved_response = logpath
    response
  end

  # Persist a successful (200) body into the read cache.
  def save_to_cache(response, url)
    if response.code == 200
      #url = response.uri.to_s
      cachefile = cache_file_for(url)
      cachefile.write response.body
      logger.info "Caching url: #{url}"
      logger.info "Cache file: #{cachefile}"
    end
  end

  # Fetch a URL, serving from cache when cache_timeout is set and fresh.
  # Retries bounded by @retries on 503 (proxy mode only); on other errors
  # either re-raises or returns the error page per @raise_exceptions.
  def get(url, cache_timeout: nil)
    # TODO: Ensure that there's no race condition in saving the cookie file
    # TODO: follow_redirects should disable exceptions for 30{1,2,7}
    url = url.to_s
    cache_timeout = cache_timeout || @cache_timeout
    infoline = "--- HTTP GET: #{url}"
    infoline << " (cache timeout: #{cache_timeout})" if cache_timeout
    infoline << " --------------------------------------------"
    puts infoline
    response = get_cached_response_for(url, cache_timeout) if cache_timeout
    unless response
      # when google blocks us, it raises: Mechanize::ResponseCodeError (503 => Net::HTTPServiceUnavailable for http://scholar.google.com/sorry/?continue=http://scholar.google.com/scholar%3Fstart%3D10%26q%3Dauthor:%2522Alexandra%2BGheciu%2522%26hl%3Den%26as_sdt%3D0,5 -- unhandled response)
      tries = 0
      response = begin
        tries += 1
        client.get(url)
      rescue Mechanize::ResponseCodeError => e
        if @retries && e.page.code == 503 && tries < @retries
          retry
        end
        if @raise_exceptions
          raise e
        else
          e.page
        end
      end
      # Non-HTML responses (Mechanize::File/Download) are returned unlogged.
      return response if not response.is_a? Mechanize::Page
      response = log_response(response)
      save_to_cache(response, url) if cache_timeout
    end
    # if @raise_exceptions == false
    # if @follow_redirects == false and [301, 302, 307].include? response.code
    # # don't follow the redirect, just return the response
    # elsif response.code != 200
    # raise "HTTP GET error #{response.code}. Response has been saved to: #{response.saved_response}"
    # end
    # end
    client.cookie_jar.save(@cookie_file) if @cookie_file
    puts response.link_info
    puts
    response
  end

  # One-shot convenience GET: constructor options are split out of opts
  # (Hash#extract! is ActiveSupport), the rest forwarded to #get.
  # FIX: hashes must be splatted into keyword parameters under Ruby 3's
  # positional/keyword separation; the old `new(opts_for_new).get(url, opts)`
  # raised ArgumentError.
  def self.get(url, **opts)
    opts_for_new = opts.extract!(:cookie_file, :random_agent, :logger)
    new(**opts_for_new).get(url, **opts)
  end
end
# Alias with the acronym-uppercased spelling so both names resolve.
HTTPClient = HttpClient
# Smoke test: run this file directly to fetch a page through the cache.
if $0 == __FILE__
# ActiveSupport needed for 15.seconds and Hash#extract!.
require 'active_support/core_ext'
puts "* Getting page..."
c = HTTPClient.new(cache_timeout: 15.seconds)
# doc = HTTPClient.get("http://slashdot.org/", cache_timeout: 15.seconds)
doc = c.get("http://slashdot.org/")
p size: doc.body.size, code: doc.code, url: doc.uri, cached: doc.cached?
end