# HTTP client wrapper around Mechanize with on-disk response caching,
# per-day response logging, optional proxying and random user agents.
require 'mechanize'
require 'logger'
require_relative 'random_agent'
require_relative 'path'
# Mechanize::Page subclasses Mechanize::File, Mechanize::Download is its own thing
# module MimeInfo
# def size
# header["content-length"].to_i
# end
# def mime_type
# header["content-type"] || "application/octet-stream"
# end
# def link_info
# "#{mime_type}#{size <= 0 ? "" : " (size: #{size} bytes)"}"
# end
# end
# Monkey-patch Mechanize::Parser (mixed into Mechanize::Page and, separately,
# Mechanize::Download) with response-inspection helpers: size/MIME info,
# status-code predicates, and a raw request+response dump used for logging.
module Mechanize::Parser
  # 3xx statuses treated as redirects. NOTE(review): 308 is not included --
  # confirm whether permanent redirects with method preservation matter here.
  REDIRECT_CODES = [301, 302, 303, 307]
  # include MimeInfo

  # cached_response: true when the body was served from HttpClient's disk cache.
  # saved_response:  path of the logged response file, set by HttpClient#log_response.
  attr_accessor :cached_response, :saved_response

  # Body size in bytes as reported by the server; 0 when the header is absent.
  def size
    header["content-length"].to_i
  end

  # Content type, defaulting to the generic binary type when the server sent none.
  def mime_type
    header["content-type"] || "application/octet-stream"
  end

  # One-line human-readable summary, e.g. 'text/html (size: 1234 bytes)'.
  def link_info
    "#{mime_type}#{size <= 0 ? "" : " (size: #{size} bytes)"}"
  end

  # HTTP status as an Integer (Mechanize stores @code as a String).
  def code
    @code.to_i
  end

  def url
    @uri.to_s
  end

  # Any non-200, non-redirect status counts as an error.
  # (Was `code != 200 and not redirect?` -- `&&`/`!` avoid the low-precedence
  # `and`/`not` trap without changing the result.)
  def error?
    code != 200 && !redirect?
  end

  def redirect?
    REDIRECT_CODES.include? code
  end

  # Redirect target from the Location header (nil when not a redirect).
  def redirect
    header["location"]
  end

  # "host:port" of the proxy used for this request, or nil for direct requests.
  def proxy
    "#{mech.proxy_addr}:#{mech.proxy_port}" if mech.proxy_addr && mech.proxy_port
  end

  # Reconstruct request line + response headers + body as one loggable string.
  def body_with_headers
    output = []
    getstr = "GET #{uri}"
    getstr += " (proxy: #{proxy})" if proxy
    getstr += " [cached]" if cached_response?
    output << getstr
    output << ""
    output << "HTTP/1.1 #{code}"
    canonical_each do |header, val|
      output << "#{header}: #{val}"
    end
    output << ""
    output << body
    output.join("\n")
  end

  # Predicate wrapper over the cached_response flag.
  def cached_response?
    cached_response
  end
  alias_method :cached?, :cached_response?
end
# HTTP client built on Mechanize. Adds: URL-keyed on-disk caching with TTL,
# logging of every response (headers + body) into per-day directories,
# optional proxy support with bounded 503 retries, random user agents,
# and cookie-jar persistence.
class HttpClient
  # Root directory for the read cache and the per-day response logs.
  CACHEDIR = Path["pagecache/"]

  # @param random_agent     [Boolean]      pick a random User-Agent via RandomAgent
  # @param retries          [Integer]      retry budget for 503s (only honored when a proxy is set)
  # @param cookie_file      [String, nil]  path used to load/save the cookie jar
  # @param verbose          [Boolean, nil] accepted but currently unused -- kept for interface compatibility
  # @param proxy            [Object, nil]  proxy record responding to host/port/username/password
  # @param relative_to      [String, nil]  base URL for resolving relative request URLs
  # @param cache_timeout    [Numeric, nil] default cache TTL in seconds
  # @param raise_exceptions [Boolean]      re-raise Mechanize errors instead of returning the error page
  # @param follow_redirects [Boolean]      let Mechanize follow 3xx automatically
  # @param logger           [Logger, nil]  defaults lazily to Rails.logger
  def initialize( random_agent: true,
                  retries: 0,
                  cookie_file: nil,
                  verbose: nil,
                  proxy: nil,
                  relative_to: nil,
                  cache_timeout: nil,
                  raise_exceptions: true,
                  follow_redirects: true,
                  logger: nil )
    @cookie_file      = cookie_file
    @proxy            = proxy
    @relative_to      = URI.parse(relative_to) if relative_to
    @cache_timeout    = cache_timeout
    @raise_exceptions = raise_exceptions
    @follow_redirects = follow_redirects
    @logger           = logger
    @random_agent     = random_agent
    @retries          = @proxy ? retries : 0 # only retry when using a proxy
    # NOTE(review): @lock is never used below -- presumably intended to guard
    # cookie-file writes in #get; confirm before removing.
    @lock = Mutex.new
  end

  def logger
    @logger ||= Rails.logger #Logger.new $stdout
  end

  # Lazily-built shared Mechanize instance.
  def client
    @client ||= new_client
  end

  # Build a fresh, fully configured Mechanize instance.
  def new_client
    Mechanize.new do |a|
      # TODO: Cleanup Proxy model (remove "useragent", rename "url" to "host", replace fields whose values are "NULL" or "" with nil)
      if @proxy
        a.set_proxy(@proxy.host, @proxy.port, @proxy.username, @proxy.password)
      end
      if @random_agent
        a.user_agent = RandomAgent.get
      else
        a.user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.4 (KHTML, like Gecko) Chrome/22.0.1229.94 Safari/537.4"
      end
      a.max_history = 0
      a.log = logger
      # WARNING: disables TLS certificate verification for every request.
      a.verify_mode = OpenSSL::SSL::VERIFY_NONE
      a.idle_timeout = 30
      a.read_timeout = 30
      if @follow_redirects
        a.redirect_ok = true
      else
        # Return the 3xx response itself instead of raising or following it.
        a.redirect_ok = false
        a.agent.allowed_error_codes += [301, 302, 303, 307]
      end
      # File.exists? was removed in Ruby 3.2 -- File.exist? is the supported name.
      a.cookie_jar.load(@cookie_file) if @cookie_file && File.exist?(@cookie_file)
    end
  end

  # Per-day log directory (CACHEDIR/YYYY-MM-DD), created on first use.
  def ymd_path
    path = CACHEDIR/ymd
    unless path.exists?
      path.mkdir_p
      puts "* Created #{path}"
    end
    path
  end

  # CACHEDIR/"today" symlink, repointed at the current day's directory.
  def today_path
    today = CACHEDIR/"today"
    if !today.exists?
      ymd_path.ln_s today
    elsif ymd_path.dirs.last != today.symlink_target.dirs.last
      today.rm
      ymd_path.ln_s today
      puts "* Symlinked #{today} to #{ymd_path}"
    end
    today
  end

  def ymd
    Time.now.strftime("%Y-%m-%d")
  end

  def timestamp
    Time.now.strftime("%H:%M:%S.%L")
  end

  # Directory holding TTL-based cached bodies, created on first use.
  def read_cache_path
    path = CACHEDIR/"readcache"
    path.mkdir_p unless path.exists?
    path
  end

  # Stable cache key for a URL.
  def url_hash(url)
    Digest::SHA1.hexdigest(url.to_s)
  end

  def cache_file_for(url)
    read_cache_path/"#{url_hash(url)}.html"
  end

  # Return a synthetic Mechanize::Page from the disk cache, or nil on a miss.
  # Expired entries (older than cache_timeout seconds) are deleted first.
  def get_cached_response_for(url, cache_timeout)
    url = @relative_to+url if @relative_to
    file = cache_file_for(url)
    p url: url, cache: file.to_s
    if file.exists? && cache_timeout
      expiry_date = file.mtime + cache_timeout
      file.rm if Time.now > expiry_date
    end
    if file.exists?
      Mechanize::Page.new( # uri=nil, response=nil, body=nil, code=nil, mech=nil)
        URI.parse(url),                   # uri
        {'content-type'=>"text/html"},    # response
        file.read,                        # body
        200,                              # code
        new_client                        # mechanize instance
      ).tap { |r| r.cached_response = true }
    else
      nil
    end
  end

  # Write the full request/response dump into today's log directory under a
  # unique timestamped name, and record that path on the response.
  def log_response(response)
    logpath = nil
    loop do
      logpath = today_path/"#{timestamp}.html"
      break unless logpath.exists?
    end
    logpath.write response.body_with_headers
    response.saved_response = logpath
    response
  end

  # Persist a successful (200) body into the read cache.
  def save_to_cache(response, url)
    if response.code == 200
      #url = response.uri.to_s
      cachefile = cache_file_for(url)
      cachefile.write response.body
      logger.info "Caching url: #{url}"
      logger.info "Cache file: #{cachefile}"
    end
  end

  # Fetch a URL, serving from cache when cache_timeout is set and fresh.
  # Retries bounded by @retries on 503 (proxy mode only); on other errors
  # either re-raises or returns the error page per @raise_exceptions.
  def get(url, cache_timeout: nil)
    # TODO: Ensure that there's no race condition in saving the cookie file
    # TODO: follow_redirects should disable exceptions for 30{1,2,7}
    url = url.to_s
    cache_timeout = cache_timeout || @cache_timeout
    infoline = "--- HTTP GET: #{url}"
    infoline << " (cache timeout: #{cache_timeout})" if cache_timeout
    infoline << " --------------------------------------------"
    puts infoline
    response = get_cached_response_for(url, cache_timeout) if cache_timeout
    unless response
      # when google blocks us, it raises: Mechanize::ResponseCodeError (503 => Net::HTTPServiceUnavailable for http://scholar.google.com/sorry/?continue=http://scholar.google.com/scholar%3Fstart%3D10%26q%3Dauthor:%2522Alexandra%2BGheciu%2522%26hl%3Den%26as_sdt%3D0,5 -- unhandled response)
      tries = 0
      response = begin
        tries += 1
        client.get(url)
      rescue Mechanize::ResponseCodeError => e
        if @retries && e.page.code == 503 && tries < @retries
          retry
        end
        if @raise_exceptions
          raise e
        else
          e.page
        end
      end
      # Non-HTML responses (Mechanize::File/Download) are returned unlogged.
      return response if not response.is_a? Mechanize::Page
      response = log_response(response)
      save_to_cache(response, url) if cache_timeout
    end
    # if @raise_exceptions == false
    # if @follow_redirects == false and [301, 302, 307].include? response.code
    # # don't follow the redirect, just return the response
    # elsif response.code != 200
    # raise "HTTP GET error #{response.code}. Response has been saved to: #{response.saved_response}"
    # end
    # end
    client.cookie_jar.save(@cookie_file) if @cookie_file
    puts response.link_info
    puts
    response
  end

  # One-shot convenience GET: constructor options are split out of opts
  # (Hash#extract! is ActiveSupport), the rest forwarded to #get.
  # FIX: hashes must be splatted into keyword parameters under Ruby 3's
  # positional/keyword separation; the old `new(opts_for_new).get(url, opts)`
  # raised ArgumentError.
  def self.get(url, **opts)
    opts_for_new = opts.extract!(:cookie_file, :random_agent, :logger)
    new(**opts_for_new).get(url, **opts)
  end
end
# Alias with the acronym-uppercased spelling so both names resolve.
HTTPClient = HttpClient
# Smoke test: run this file directly to fetch a page through the cache.
if $0 == __FILE__
# ActiveSupport needed for 15.seconds and Hash#extract!.
require 'active_support/core_ext'
puts "* Getting page..."
c = HTTPClient.new(cache_timeout: 15.seconds)
# doc = HTTPClient.get("http://slashdot.org/", cache_timeout: 15.seconds)
doc = c.get("http://slashdot.org/")
p size: doc.body.size, code: doc.code, url: doc.uri, cached: doc.cached?
end