Skip to content

Instantly share code, notes, and snippets.

@leonid-shevtsov
Created October 13, 2012 18:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save leonid-shevtsov/3885571 to your computer and use it in GitHub Desktop.
Save leonid-shevtsov/3885571 to your computer and use it in GitHub Desktop.
$ python2.6 test.py
(<type 'exceptions.OSError'>, OSError(32, 'Broken pipe'), <traceback object at 0x1096aa950>)
Traceback (most recent call last):
File "test.py", line 9, in <module>
lines=sp.communicate(inp)[0]
File "/System/Library/Frameworks/Python.framework/Versions/2.6/lib/python2.6/subprocess.py", line 691, in communicate
return self._communicate(input)
File "/System/Library/Frameworks/Python.framework/Versions/2.6/lib/python2.6/subprocess.py", line 1222, in _communicate
bytes_written = os.write(self.stdin.fileno(), chunk)
OSError: [Errno 32] Broken pipe
#!/usr/bin/env python2.6
# Reproduces an EPIPE raised by Popen.communicate(input).
# The code works on Python 2.7.2, but breaks on Python 2.6.7
# (Sublime Text 2 has 2.6.2 IIRC).
import subprocess
import sys

# Read the Ruby source that is piped to the syntax checker. The `with` block
# closes the file handle as soon as the contents are in memory.
with open('st2-crash.rb') as source_file:
    code = source_file.read()

try:
    linter = subprocess.Popen(['rvm-auto-ruby', '-wc'],
                              stdin=subprocess.PIPE,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.STDOUT)
    # this works:
    # linter.stdin.write(code)
    # lines = linter.communicate()[0]
    # this raises an EPIPE
    lines = linter.communicate(code)[0]
except:
    # Print the exception triple, then re-raise so the traceback is shown too.
    # The parenthesised form prints the same tuple on Python 2 and parses on 3.
    print(sys.exc_info())
    raise
require 'tempfile'
require 'net/ntlm'
require 'kconv'
require 'webrobots'
##
# An HTTP (and local disk access) user agent. This class is an implementation
# detail and is subject to change at any time.
class Mechanize::HTTP::Agent
# :section: Headers
# Enables If-Modified-Since conditional requests for pages that are already
# in the history. Defaults to true; set to false to disable them.
attr_accessor :conditional_requests
# Whether gzip and deflate are advertised in the Accept-Encoding request
# header. Defaults to true.
attr_accessor :gzip_enabled
# A hash of request headers to be used for every request
attr_accessor :request_headers
# The User-Agent header to send
attr_reader :user_agent
# :section: History
# History of requests made
attr_accessor :history
# :section: Hooks
# A list of hooks to call after retrieving a response. Hooks are called with
# the agent, the request URI, the response and the response body.
attr_reader :post_connect_hooks
# A list of hooks to call before making a request. Hooks are called with
# the agent and the request to be performed.
attr_reader :pre_connect_hooks
# A list of hooks to call to handle the content-encoding of a response.
# Hooks are called with the agent, the URI, the response and the body IO.
attr_reader :content_encoding_hooks
# :section: HTTP Authentication
attr_reader :auth_store # :nodoc:
attr_reader :authenticate_methods # :nodoc:
attr_reader :digest_challenges # :nodoc:
# :section: Redirection
# Follow HTML meta refresh and HTTP Refresh. If set to +:anywhere+ meta
# refresh tags outside of the head element will be followed.
attr_accessor :follow_meta_refresh
# Follow an HTML meta refresh that has no "url=" in the content attribute.
#
# Defaults to false to prevent infinite refresh loops.
attr_accessor :follow_meta_refresh_self
# Controls how this agent deals with redirects. The following values are
# allowed:
#
# :all, true:: All 3xx redirects are followed (default)
# :permanent:: Only 301 Moved Permanently redirects are followed
# false:: No redirects are followed
attr_accessor :redirect_ok
# Maximum number of redirects to follow
attr_accessor :redirection_limit
# :section: Allowed error codes
# List of error codes to handle without raising an exception.
attr_accessor :allowed_error_codes
# :section: Robots
# When true, this agent will consult the site's robots.txt for each access.
attr_reader :robots
# :section: SSL
# OpenSSL key password
attr_accessor :pass
# :section: Timeouts
# Set to false to disable HTTP/1.1 keep-alive requests
attr_accessor :keep_alive
# Length of time to wait until a connection is opened in seconds
attr_accessor :open_timeout
# Length of time to attempt to read data from the server
attr_accessor :read_timeout
# :section:
# The cookies for this agent
attr_accessor :cookie_jar
# Responses at least this large will be written to a Tempfile instead of
# stored in memory. Setting this to nil disables creation of Tempfiles.
attr_accessor :max_file_buffer
# :section: Utility
# The context parses responses into pages
attr_accessor :context
attr_reader :http # :nodoc:
# When set to true mechanize will ignore an EOF during chunked transfer
# encoding so long as at least one byte was received. Be careful when
# enabling this as it may cause data loss.
attr_accessor :ignore_bad_chunking
# Handlers for various URI schemes
attr_accessor :scheme_handlers
# :section:

##
# Builds a Mechanize HTTP user agent with its default settings. The user
# agent is an implementation detail of mechanize and its API may change at
# any time.
def initialize
  # Request and response behaviour
  @allowed_error_codes  = []
  @conditional_requests = true
  @gzip_enabled         = true
  @ignore_bad_chunking  = false
  @keep_alive           = true
  @max_file_buffer      = 100_000 # bytes; larger bodies go to a Tempfile
  @open_timeout         = nil
  @read_timeout         = nil
  @request_headers      = {}
  @user_agent           = nil

  # Agent state
  @context    = nil
  @cookie_jar = Mechanize::CookieJar.new
  @history    = Mechanize::History.new

  # Hooks
  @content_encoding_hooks = []
  @post_connect_hooks     = []
  @pre_connect_hooks      = []

  # Redirection
  @follow_meta_refresh      = false
  @follow_meta_refresh_self = false
  @redirect_ok              = true
  @redirection_limit        = 20

  # Robots
  @robots    = false
  @webrobots = nil

  # HTTP Authentication: base URI => auth scheme => list of realms
  @auth_store           = Mechanize::HTTP::AuthStore.new
  @authenticate_parser  = Mechanize::HTTP::WWWAuthenticateParser.new
  @authenticate_methods = Hash.new { |methods, uri|
    methods[uri] = Hash.new { |realms, auth_scheme| realms[auth_scheme] = [] }
  }
  @digest_auth       = Net::HTTP::DigestAuth.new
  @digest_challenges = {}

  # SSL
  @pass = nil

  # Unknown schemes get a handler that raises; the known ones share a single
  # handler that returns the link untouched.
  @scheme_handlers = Hash.new do |handlers, scheme|
    handlers[scheme] = lambda do |link, page|
      raise Mechanize::UnsupportedSchemeError, scheme
    end
  end
  pass_through = lambda { |link, page| link }
  ['http', 'https', 'relative', 'file'].each do |scheme|
    @scheme_handlers[scheme] = pass_through
  end

  @http = Net::HTTP::Persistent.new 'mechanize'
  @http.idle_timeout = 5
  @http.keep_alive   = 300
end
##
# Registers the credentials +user+ and +password+ for +uri+ in the auth
# store. When +realm+ is given the credentials apply to that realm only;
# otherwise they are the default for any realm on that URI.
#
# +domain+ and +realm+ are exclusive as NTLM does not follow RFC 2617. If
# +domain+ is given it is only used for NTLM authentication.
def add_auth(uri, user, password, realm = nil, domain = nil)
  @auth_store.add_auth(uri, user, password, realm, domain)
end
##
# USE OF add_default_auth IS NOT RECOMMENDED AS IT MAY EXPOSE PASSWORDS TO
# THIRD PARTIES
#
# Registers +user+ and +password+ in the auth store as the default
# credentials, to be used when no other credentials are available.
#
# If +domain+ is given it is only used for NTLM authentication.
def add_default_auth(user, password, domain = nil) # :nodoc:
  @auth_store.add_default_auth(user, password, domain)
end
##
# Retrieves +uri+ and parses it into a page or other object according to
# PluggableParser. If the URI is an HTTP or HTTPS scheme URI the given HTTP
# +method+ is used to retrieve it, along with the HTTP +headers+, request
# +params+ and HTTP +referer+.
#
# +redirects+ tracks the number of redirects experienced when retrieving the
# page. If it is over the redirection_limit an error will be raised.
#
# Raises Mechanize::RobotsDisallowedError when robots handling is enabled and
# the URI (or the retrieved page) is disallowed, and
# Mechanize::ResponseCodeError for a response that is neither handled here
# nor listed in allowed_error_codes.
def fetch(uri, method = :get, headers = {}, params = [],
          referer = current_page, redirects = 0)
  referer_uri = referer ? referer.uri : nil

  uri = resolve uri, referer
  uri, params = resolve_parameters uri, method, params

  request = http_request uri, method, params
  connection = connection_for uri

  request_auth             request, uri
  disable_keep_alive       request
  enable_gzip              request
  request_language_charset request
  request_cookies          request, uri
  request_host             request, uri
  request_referer          request, uri, referer_uri
  request_user_agent       request
  request_add_headers      request, headers
  pre_connect              request

  # Consult robots.txt
  if robots && uri.is_a?(URI::HTTP)
    robots_allowed?(uri) or raise Mechanize::RobotsDisallowedError.new(uri)
  end

  # Add If-Modified-Since if the page is in the history. The history is
  # consulted once, and only when conditional requests are enabled.
  if @conditional_requests
    cached = visited_page(uri)
    if cached and cached.response['Last-Modified']
      request['If-Modified-Since'] = cached.response['Last-Modified']
    end
  end

  # Specify timeouts if given
  connection.open_timeout = @open_timeout if @open_timeout
  connection.read_timeout = @read_timeout if @read_timeout

  request_log request

  response_body_io = nil

  # Send the request
  begin
    response = connection.request(uri, request) { |res|
      response_log res

      response_body_io = response_read res, request, uri

      res
    }
  rescue Mechanize::ChunkedTerminationError => e
    # A truncated chunked body is only accepted when the user opted in
    raise unless @ignore_bad_chunking

    response = e.response
    response_body_io = e.body_io
  end

  hook_content_encoding response, uri, response_body_io

  response_body_io = response_content_encoding response, response_body_io if
    request.response_body_permitted?

  post_connect uri, response, response_body_io

  page = response_parse response, response_body_io, uri

  response_cookies response, uri, page

  meta = response_follow_meta_refresh response, uri, page, redirects
  return meta if meta

  case response
  when Net::HTTPSuccess
    if robots && page.is_a?(Mechanize::Page)
      page.parser.noindex? and raise Mechanize::RobotsDisallowedError.new(uri)
    end

    page
  when Mechanize::FileResponse
    page
  when Net::HTTPNotModified
    log.debug("Got cached page") if log
    visited_page(uri) || page
  when Net::HTTPRedirection
    response_redirect response, method, page, redirects, headers, referer
  when Net::HTTPUnauthorized
    response_authenticate(response, page, uri, request, headers, params,
                          referer)
  else
    # Error codes listed in allowed_error_codes are returned as ordinary
    # pages. Codes are compared as strings so that both 404 and '404' match.
    if @allowed_error_codes.any? { |code| code.to_s == page.code.to_s }
      if robots && page.is_a?(Mechanize::Page)
        page.parser.noindex? and raise Mechanize::RobotsDisallowedError.new(uri)
      end

      page
    else
      raise Mechanize::ResponseCodeError.new(page, 'unhandled response')
    end
  end
end
# URI for a proxy connection, as reported by the underlying
# Net::HTTP::Persistent connection (+@http+).
def proxy_uri
@http.proxy_uri
end
# Retry non-idempotent requests? Returns the +retry_change_requests+
# setting of the underlying connection (+@http+).
def retry_change_requests
@http.retry_change_requests
end
# Sets whether non-idempotent requests are retried by forwarding +retri+ to
# the underlying connection (+@http+).
def retry_change_requests=(retri)
  @http.retry_change_requests = retri
end
# :section: Headers

# Sets the User-Agent header value. The memoized WebRobots instance is
# dropped whenever the value actually changes, because it was built with the
# previous user agent.
def user_agent=(user_agent)
  @webrobots = nil unless user_agent == @user_agent
  @user_agent = user_agent
end
# :section: History
# Equivalent to the browser back button. Pops the most recent entry off the
# history and returns it.
def back
@history.pop
end
##
# Returns the latest page loaded by the agent, i.e. the last entry of the
# history.
def current_page
@history.last
end
# Returns the +max_size+ of the history.
def max_history
@history.max_size
end
# Sets the +max_size+ of the history to +length+.
def max_history=(length)
@history.max_size = length
end
# Looks +url+ up in the history after resolving it. Returns whatever the
# history reports for the resolved URL (a visited page, otherwise nil).
def visited_page(url)
  resolved = resolve(url)
  @history.visited_page(resolved)
end
# :section: Hooks

# Calls every hook in content_encoding_hooks with the agent, the +uri+, the
# +response+ and the +response_body_io+, in registration order.
def hook_content_encoding(response, uri, response_body_io)
  @content_encoding_hooks.each { |encoding_hook|
    encoding_hook.call(self, uri, response, response_body_io)
  }
end
##
# Runs the hooks in post_connect_hooks after a +response+ has been returned
# and its body handled.
#
# Each hook receives the agent, the +uri+ of the request, the +response+ and
# the full content read from +body_io+. +body_io+ is rewound after every
# hook, even when the hook raises.
def post_connect(uri, response, body_io) # :yields: agent, uri, response, body
  @post_connect_hooks.each { |hook|
    begin
      body = body_io.read
      hook.call(self, uri, response, body)
    ensure
      body_io.rewind
    end
  }
end
##
# Runs the hooks in pre_connect_hooks before a +request+ is made. Each hook
# receives the agent and the +request+ that is about to be performed.
def pre_connect(request) # :yields: agent, request
  @pre_connect_hooks.each { |hook| hook.call(self, request) }
end
# :section: Request

# Picks the connection object used to retrieve +uri+: the shared persistent
# HTTP connection for http and https, a new Mechanize::FileConnection for
# file. Any other scheme yields nil.
def connection_for(uri)
  scheme = uri.scheme.downcase

  if scheme == 'http' || scheme == 'https'
    @http
  elsif scheme == 'file'
    Mechanize::FileConnection.new
  end
end
##
# Decodes a gzip-encoded +body_io+ and returns an IO holding the decoded
# content. If the gzip reader fails, a raw inflate of +body_io+ is tried
# instead; if that fails as well the Zlib error is re-raised.
#
# +body_io+ is closed before returning, whether decoding succeeded or not.
def content_encoding_gunzip body_io
log.debug('gzip response') if log
zio = Zlib::GzipReader.new body_io
out_io = auto_io 'mechanize-gunzip', 16384, zio
zio.finish
return out_io
rescue Zlib::Error => gz_error
log.warn "unable to gunzip response: #{gz_error} (#{gz_error.class})" if
log
# Start again from the beginning and skip the first 10 bytes before trying
# a raw inflate (presumably the fixed-size gzip header -- TODO confirm)
body_io.rewind
body_io.read 10
begin
log.warn "trying raw inflate on response" if log
return inflate body_io, -Zlib::MAX_WBITS
rescue Zlib::Error => e
log.error "unable to inflate response: #{e} (#{e.class})" if log
raise
end
ensure
# do not close a second time if we failed the first time
zio.close if zio and not (zio.closed? or gz_error)
body_io.close unless body_io.closed?
end
##
# Decodes a deflate-encoded +body_io+ and returns the decoded IO. When the
# first attempt fails, +body_io+ is rewound and a raw inflate is tried; if
# that fails too the Zlib error is re-raised.
#
# +body_io+ is closed before returning, whether decoding succeeded or not.
def content_encoding_inflate(body_io)
  begin
    log.debug('deflate body') if log

    inflate body_io
  rescue Zlib::Error
    log.error('unable to inflate response, trying raw deflate') if log

    body_io.rewind

    begin
      inflate body_io, -Zlib::MAX_WBITS
    rescue Zlib::Error => raw_error
      log.error("unable to inflate response: #{raw_error}") if log
      raise
    end
  end
ensure
  body_io.close
end
# Adds "connection: close" to +request+ when keep-alive has been turned off
# for this agent; otherwise leaves +request+ untouched.
def disable_keep_alive(request)
  return if @keep_alive

  request['connection'] = 'close'
end
# Sets the accept-encoding header of +request+: gzip, deflate and identity
# when gzip is enabled for this agent, identity only otherwise.
def enable_gzip(request)
  accepted = @gzip_enabled ? 'gzip,deflate,identity' : 'identity'
  request['accept-encoding'] = accepted
end
# Builds the request object for +uri+. For http and https this is the
# Net::HTTP request class named after +method+ (:get => Net::HTTP::Get),
# with the first element of +params+ as its body when +params+ is given.
# For file it is a Mechanize::FileRequest. Any other scheme yields nil.
def http_request(uri, method, params = nil)
  scheme = uri.scheme.downcase

  return Mechanize::FileRequest.new(uri) if scheme == 'file'
  return nil unless scheme == 'http' || scheme == 'https'

  request_class = Net::HTTP.const_get(method.to_s.capitalize)
  request = request_class.new(uri.request_uri)
  request.body = params.first if params
  request
end
# Copies the agent-wide request headers and then the per-request +headers+
# onto +request+. The symbols :etag and :if_modified_since are translated to
# their header names; any other Symbol key raises ArgumentError.
def request_add_headers(request, headers = {})
  @request_headers.each { |name, value| request[name] = value }

  headers.each { |field, value|
    if :etag == field
      request["ETag"] = value
    elsif :if_modified_since == field
      request["If-Modified-Since"] = value
    elsif Symbol === field
      raise ArgumentError, "unknown header symbol #{field}"
    else
      request[field] = value
    end
  }
end
# Adds credentials to +request+ when an authentication scheme has already
# been recorded for the base URI of +uri+. Digest is preferred, then the IIS
# variant of digest, then basic. Nothing is added when no realm matches.
def request_auth(request, uri)
  base_uri = uri + '/'
  schemes = @authenticate_methods[base_uri]
  for_base = lambda { |candidate| candidate.uri == base_uri }

  if (realm = schemes[:digest].find(&for_base))
    request_auth_digest request, uri, realm, base_uri, false
  elsif (realm = schemes[:iis_digest].find(&for_base))
    request_auth_digest request, uri, realm, base_uri, true
  elsif (realm = schemes[:basic].find(&for_base))
    user, password, = @auth_store.credentials_for uri, realm.realm
    request.basic_auth user, password
  end
end
# Sets the Authorization header of +request+ for digest authentication
# against +realm+. The credentials for +uri+ are written into +uri+ itself
# (user and password) before the header is computed from the stored
# challenge. +iis+ selects the IIS flavour of the digest header.
def request_auth_digest(request, uri, realm, base_uri, iis)
  user, password, = @auth_store.credentials_for(uri, realm.realm)
  challenge = @digest_challenges[realm]

  uri.user = user
  uri.password = password

  request['Authorization'] =
    @digest_auth.auth_header(uri, challenge.to_s, request.method, iis)
end
# Adds a Cookie field to +request+ holding every cookie the jar has for
# +uri+, joined with "; ". Nothing is added when the jar has none.
def request_cookies(request, uri)
  return if @cookie_jar.empty?(uri)

  jar_cookies = @cookie_jar.cookies(uri)

  request.add_field('Cookie', jar_cookies.join('; ')) unless jar_cookies.empty?
end
# Sets the Host header of +request+ from +uri+. The port is appended unless
# it is 80 or 443.
def request_host(request, uri)
  host_parts = [uri.host]
  host_parts << uri.port unless [80, 443].include?(uri.port.to_i)

  request['Host'] = host_parts.compact.join(':')
end
# Sets fixed accept-charset and accept-language headers on +request+.
def request_language_charset request
request['accept-charset'] = 'ISO-8859-1,utf-8;q=0.7,*;q=0.7'
request['accept-language'] = 'en-us,en;q=0.5'
end
# Logs +request+ when a logger is available: its class and path at info
# level, then every request header at debug level.
def request_log(request)
  logger = log
  return unless logger

  logger.info("#{request.class}: #{request.path}")

  request.each_header { |name, value|
    logger.debug("request-header: #{name} => #{value}")
  }
end
# Sets the Referer header of +request+ from +referer+. The fragment is
# removed as demanded by RFC 2616 14.36, and the user information is
# removed just like major browsers do; +referer+ itself is not modified.
#
# No header is set when +referer+ is nil, or when +referer+ is an https URI
# and +uri+ is not.
def request_referer(request, uri, referer)
  return unless referer

  leaving_https = 'https'.casecmp(referer.scheme) == 0 &&
                  'https'.casecmp(uri.scheme) != 0
  return if leaving_https

  if referer.fragment || referer.user || referer.password
    referer = referer.dup
    referer.password = nil
    referer.user = nil
    referer.fragment = nil
  end

  request['Referer'] = referer
end
# Sets the User-Agent header of +request+ when a user agent has been
# configured for this agent.
def request_user_agent(request)
  return unless @user_agent

  request['User-Agent'] = @user_agent
end
# Turns +uri+ (a URI object, a string-like value or nil) into an absolute
# URI with an http, https or file scheme, using +referer+ as the base for
# relative references.
#
# A nil +uri+ resolves to the referer's URI; an empty string resolves to the
# referer's URI without its fragment. Raises ArgumentError when an absolute
# URI cannot be produced or when the resulting scheme is unsupported.
def resolve(uri, referer = current_page)
referer_uri = referer && referer.uri
if uri.is_a?(URI)
# Work on a copy so the caller's URI object is never modified
uri = uri.dup
elsif uri.nil?
if referer_uri
return referer_uri
end
raise ArgumentError, "absolute URL needed (not nil)"
else
url = uri.to_s.strip
if url.empty?
if referer_uri
return referer_uri.dup.tap { |u| u.fragment = nil }
end
raise ArgumentError, "absolute URL needed (not #{uri.inspect})"
end
# Percent-escape every character outside the 0..126 range
url.gsub!(/[^#{0.chr}-#{126.chr}]/o) { |match|
if RUBY_VERSION >= "1.9.0"
Mechanize::Util.uri_escape(match)
else
begin
sprintf('%%%X', match.unpack($KCODE == 'UTF8' ? 'U' : 'C').first)
rescue ArgumentError
# workaround for ruby 1.8 with -Ku but ISO-8859-1 characters in
# URIs. See #227. I can't wait to drop 1.8 support
sprintf('%%%X', match.unpack('C').first)
end
end
}
# Escape the parts between existing %XX runs and '#' characters, keeping
# those separators as they are, then undo HTML entity escaping
escaped_url = Mechanize::Util.html_unescape(
url.split(/((?:%[0-9A-Fa-f]{2})+|#)/).each_slice(2).map { |x, y|
"#{WEBrick::HTTPUtils.escape(x)}#{y}"
}.join('')
)
begin
uri = URI.parse(escaped_url)
rescue
# Second attempt: escape the whole string once more before parsing
uri = URI.parse(WEBrick::HTTPUtils.escape(escaped_url))
end
end
# Let the handler registered for the scheme rewrite (or reject) the URI
scheme = uri.relative? ? 'relative' : uri.scheme.downcase
uri = @scheme_handlers[scheme].call(uri, referer)
if referer_uri
if uri.path.length == 0 && uri.relative?
uri.path = referer_uri.path
end
end
uri.path = '/' if uri.path.length == 0
if uri.relative?
raise ArgumentError, "absolute URL needed (not #{uri})" unless
referer_uri
# Use the referer's last base only when that base has an absolute URI
if referer.respond_to?(:bases) && referer.parser &&
(lbase = referer.bases.last) && lbase.uri && lbase.uri.absolute?
base = lbase
else
base = nil
end
uri = referer_uri + (base ? base.uri : referer_uri) + uri
# Strip initial "/.." bits from the path
uri.path.sub!(/^(\/\.\.)+(?=\/)/, '')
end
unless ['http', 'https', 'file'].include?(uri.scheme.downcase)
raise ArgumentError, "unsupported scheme: #{uri.scheme}"
end
uri
end
# Decides where the request +parameters+ go. For :head, :get, :delete and
# :trace they are appended to the query string of +uri+ and [uri, nil] is
# returned; for every other +method+ [uri, parameters] is returned
# unchanged.
def resolve_parameters(uri, method, parameters)
  query_methods = [:head, :get, :delete, :trace]
  return uri, parameters unless query_methods.include?(method)

  if parameters and parameters.length > 0
    uri.query ||= ''
    uri.query << '&' if uri.query.length > 0
    uri.query << Mechanize::Util.build_query_string(parameters)
  end

  return uri, nil
end
# :section: Response

# Finds the refresh to follow for +page+, either from its first meta refresh
# or from the refresh header of +response+. Returns [delay, href], or nil
# when meta refresh is not followed, no refresh is present, or the refresh
# points at the page itself and follow_meta_refresh_self is off.
#
# Raises Mechanize::Error when the refresh header cannot be parsed.
def get_meta_refresh(response, uri, page)
  return nil unless @follow_meta_refresh

  if page.respond_to?(:meta_refresh) and
     (redirect = page.meta_refresh.first)
    return nil if !@follow_meta_refresh_self && redirect.link_self

    [redirect.delay, redirect.href]
  elsif (refresh = response['refresh'])
    delay, href, link_self = Mechanize::Page::MetaRefresh.parse refresh, uri
    raise Mechanize::Error, 'Invalid refresh http header' unless delay
    return nil if !@follow_meta_refresh_self && link_self

    [delay.to_f, href]
  end
end
# Handles a 401 +response+: picks a supported challenge from its
# WWW-Authenticate header (Digest first, then NTLM, then Basic), records the
# chosen realm so that the retry carries credentials, and fetches +uri+
# again with the original request method, +headers+, +params+ and
# +referer+.
#
# Raises Mechanize::UnauthorizedError when the header is missing, when no
# credentials are available, when a realm that was already tried is
# challenged again, or when no supported scheme is offered.
def response_authenticate(response, page, uri, request, headers, params,
                          referer)
  www_authenticate = response['www-authenticate']

  unless www_authenticate
    message = 'WWW-Authenticate header missing in response'
    raise Mechanize::UnauthorizedError.new(page, nil, message)
  end

  challenges = @authenticate_parser.parse www_authenticate

  unless @auth_store.credentials? uri, challenges then
    message = "no credentials found, provide some with #add_auth"
    raise Mechanize::UnauthorizedError.new(page, challenges, message)
  end

  if challenge = challenges.find { |c| c.scheme =~ /^Digest$/i } then
    realm = challenge.realm uri

    auth_scheme = if response['server'] =~ /Microsoft-IIS/ then
                    :iis_digest
                  else
                    :digest
                  end

    existing_realms = @authenticate_methods[realm.uri][auth_scheme]

    # Being challenged again for a realm we already answered means the
    # credentials were rejected
    if existing_realms.include? realm
      message = 'Digest authentication failed'
      raise Mechanize::UnauthorizedError.new(page, challenges, message)
    end

    existing_realms << realm
    @digest_challenges[realm] = challenge
  elsif challenge = challenges.find { |c| c.scheme == 'NTLM' } then
    # No realm is looked up for NTLM; nil is stored as the marker that the
    # handshake has been started for this base URI.
    realm = nil
    existing_realms = @authenticate_methods[uri + '/'][:ntlm]

    if existing_realms.include?(realm) and not challenge.params then
      message = 'NTLM authentication failed'
      raise Mechanize::UnauthorizedError.new(page, challenges, message)
    end

    existing_realms << realm

    if challenge.params then
      # Second leg: answer the server's type 2 message with a type 3 message
      type_2 = Net::NTLM::Message.decode64 challenge.params

      user, password, domain = @auth_store.credentials_for uri, nil

      type_3 = type_2.response({ :user => user, :password => password,
                                 :domain => domain },
                               { :ntlmv2 => true }).encode64

      headers['Authorization'] = "NTLM #{type_3}"
    else
      # First leg: open the handshake with a type 1 message
      type_1 = Net::NTLM::Message::Type1.new.encode64
      headers['Authorization'] = "NTLM #{type_1}"
    end
  elsif challenge = challenges.find { |c| c.scheme == 'Basic' } then
    realm = challenge.realm uri

    existing_realms = @authenticate_methods[realm.uri][:basic]

    if existing_realms.include? realm then
      message = 'Basic authentication failed'
      raise Mechanize::UnauthorizedError.new(page, challenges, message)
    end

    existing_realms << realm
  else
    message = 'unsupported authentication scheme'
    raise Mechanize::UnauthorizedError.new(page, challenges, message)
  end

  fetch uri, request.method.downcase.to_sym, headers, params, referer
end
# Decodes +body_io+ according to the Content-Encoding header of +response+
# and returns an IO positioned at the start of the decoded body. Bodies of
# length zero and unencoded bodies are returned as they are.
#
# Raises Mechanize::Error for an unsupported encoding or when Zlib fails to
# decode the body.
def response_content_encoding response, body_io
  length = response.content_length ||
    case body_io
    when Tempfile, IO then
      body_io.stat.size
    else
      body_io.length
    end

  return body_io if length.zero?

  out_io = case response['Content-Encoding']
           when nil, 'none', '7bit' then
             body_io
           when 'deflate' then
             content_encoding_inflate body_io
           when 'gzip', 'x-gzip' then
             content_encoding_gunzip body_io
           else
             raise Mechanize::Error,
               "unsupported content-encoding: #{response['Content-Encoding']}"
           end

  out_io.flush
  out_io.rewind

  out_io
rescue Zlib::Error => e
  message = "error handling content-encoding #{response['Content-Encoding']}:"
  message << " #{e.message} (#{e.class})"
  raise Mechanize::Error, message
ensure
  begin
    # Remove the raw Tempfile only once a different IO has replaced it.
    # out_io is nil on the early return and when an error was raised, and
    # must not be dereferenced then or the real error would be masked.
    if Tempfile === body_io and out_io and
       (StringIO === out_io or out_io.path != body_io.path) then
      body_io.close!
    end
  rescue IOError
    # HACK ruby 1.8 raises IOError when closing the stream
  end
end
# Saves the cookies found in a response for +uri+: first those declared in
# Set-Cookie meta tags in the head of +page+ (only looked for when +page+ is
# a Mechanize::Page whose body mentions Set-Cookie), then those from the
# Set-Cookie header fields of +response+.
def response_cookies(response, uri, page)
  if Mechanize::Page === page and page.body =~ /Set-Cookie/n
    meta_tags = page.search('//head/meta[@http-equiv="Set-Cookie"]')
    meta_tags.each { |meta| save_cookies(uri, meta['content']) }
  end

  header_cookies = response.get_fields 'Set-Cookie'

  header_cookies.each { |set_cookie| save_cookies(uri, set_cookie) } if
    header_cookies
end
# Parses the +set_cookie+ string for +uri+ and offers every resulting cookie
# to the cookie jar, logging at debug level whether the jar saved or
# rejected it.
def save_cookies(uri, set_cookie)
  logger = log # look the logger up only once

  Mechanize::Cookie.parse(uri, set_cookie, logger) do |cookie|
    if @cookie_jar.add(uri, cookie)
      logger.debug("saved cookie: #{cookie}") if logger
    else
      logger.debug("rejected cookie: #{cookie}") if logger
    end
  end
end
# Follows the refresh found for +page+, if any. Returns nil when there is
# nothing to follow; otherwise waits for the refresh delay, pushes +page+
# onto the history and returns the result of fetching the new URL with GET.
#
# Raises Mechanize::RedirectLimitReachedError when following would exceed
# the redirection limit.
def response_follow_meta_refresh(response, uri, page, redirects)
  delay, new_url = get_meta_refresh(response, uri, page)
  return nil unless delay

  # A refresh without a URL reloads the current one
  new_url = new_url ? resolve(new_url, page) : uri

  if redirects + 1 > @redirection_limit
    raise Mechanize::RedirectLimitReachedError.new(page, redirects)
  end

  sleep delay
  @history.push(page, page.uri)

  fetch(new_url, :get, {}, [], Mechanize::Page.new, redirects + 1)
end
# Logs +response+ when a logger is available: a status line at info level,
# then every response header at debug level.
def response_log(response)
  logger = log
  return unless logger

  logger.info("status: #{response.class} #{response.http_version} " \
              "#{response.code} #{response.message}")

  response.each_header { |name, value|
    logger.debug("response-header: #{name} => #{value}")
  }
end
# Hands +uri+, +response+ and +body_io+ to the context's +parse+ method and
# returns its result.
def response_parse response, body_io, uri
@context.parse uri, response, body_io
end
# Reads the body of +response+ into an IO and returns it rewound. The body
# is kept in a StringIO until it reaches the max_file_buffer size, at which
# point it is moved into a Tempfile.
#
# Raises Mechanize::ChunkedTerminationError when a chunked response ends
# early after at least one byte was read, Mechanize::ResponseReadError when
# the connection fails while reading, Mechanize::ResponseCodeError for an
# unknown response class, and EOFError when the body length does not match
# the Content-Length (HEAD requests and redirects excepted).
def response_read response, request, uri
content_length = response.content_length
# Start with a Tempfile right away when the announced length is large
if use_tempfile? content_length then
body_io = make_tempfile 'mechanize-raw'
else
body_io = StringIO.new
end
body_io.set_encoding Encoding::BINARY if body_io.respond_to? :set_encoding
total = 0
begin
response.read_body { |part|
total += part.length
# Move what was read so far into a Tempfile once the running total
# reaches the buffer limit
if StringIO === body_io and use_tempfile? total then
new_io = make_tempfile 'mechanize-raw'
new_io.write body_io.string
body_io = new_io
end
body_io.write(part)
log.debug("Read #{part.length} bytes (#{total} total)") if log
}
rescue EOFError => e
# terminating CRLF might be missing, let the user check the document
raise unless response.chunked? and total.nonzero?
body_io.rewind
raise Mechanize::ChunkedTerminationError.new(e, response, body_io, uri,
@context)
rescue Net::HTTP::Persistent::Error => e
body_io.rewind
raise Mechanize::ResponseReadError.new(e, response, body_io, uri,
@context)
end
body_io.flush
body_io.rewind
raise Mechanize::ResponseCodeError.new(response, uri) if
Net::HTTPUnknownResponse === response
content_length = response.content_length
# The length check is skipped for HEAD requests and for redirects
unless Net::HTTP::Head === request or Net::HTTPRedirection === response then
raise EOFError, "Content-Length (#{content_length}) does not match " \
"response body length (#{body_io.length})" if
content_length and content_length != body_io.length
end
body_io
end
# Follows the redirect in +response+ when redirect_ok allows it; otherwise
# returns +page+ as it is. When followed, +page+ is pushed onto the history
# and the Location is fetched with GET (HEAD stays HEAD), without the
# Content-Length, Content-MD5 and Content-Type entries of +headers+.
#
# Raises Mechanize::RedirectLimitReachedError when following would exceed
# the redirection limit.
def response_redirect(response, method, page, redirects, headers,
                      referer = current_page)
  # true and :all (and any unrecognised value) mean "follow"
  return page if [false, nil].include?(@redirect_ok)

  if :permanent == @redirect_ok
    return page unless Net::HTTPMovedPermanently === response
  end

  log.info("follow redirect to: #{response['Location']}") if log

  if redirects + 1 > @redirection_limit
    raise Mechanize::RedirectLimitReachedError.new(page, redirects)
  end

  redirect_method = method == :head ? :head : :get

  # Make sure we are not copying over the POST headers from the original
  # request
  ['Content-Length', 'Content-MD5', 'Content-Type'].each { |name|
    headers.delete name
  }

  @history.push(page, page.uri)
  new_uri = resolve(response['Location'].to_s, page)

  fetch(new_uri, redirect_method, headers, [], referer, redirects + 1)
end
# :section: Robots

# Fetches +uri+ and returns the body of the result. A response code error
# with code 404 yields an empty string; any other response code error is
# raised again.
def get_robots(uri) # :nodoc:
  fetch(uri).body
rescue Mechanize::ResponseCodeError => error
  raise error unless error.response_code == '404'

  ''
end
# Turns robots.txt handling on or off. The webrobots library is loaded when
# a true +value+ is given, and the memoized WebRobots instance is dropped
# whenever the setting changes.
def robots=(value)
  require 'webrobots' if value

  @webrobots = nil unless value == @robots
  @robots = value
end
##
# Tests if this agent is allowed to access +uri+, consulting the site's
# robots.txt. Requests for /robots.txt itself are always allowed.
def robots_allowed?(uri)
  uri.request_uri == '/robots.txt' || webrobots.allowed?(uri)
end
# Opposite of robots_allowed?: true when this agent may not access +url+.
def robots_disallowed?(url)
  allowed = robots_allowed?(url)
  !allowed
end
# Returns an error object if there is an error in fetching or parsing
# robots.txt of the site +url+. Delegates to WebRobots#error.
def robots_error(url)
webrobots.error(url)
end
# Raises the error if there is an error in fetching or parsing robots.txt of
# the site +url+. Delegates to WebRobots#error!.
def robots_error!(url)
webrobots.error!(url)
end
# Removes robots.txt cache for the site +url+. Delegates to
# WebRobots#reset.
def robots_reset(url)
webrobots.reset(url)
end
# Returns the memoized WebRobots instance, creating it on first use with the
# current user agent and get_robots as its HTTP getter.
def webrobots
@webrobots ||= WebRobots.new(@user_agent, :http_get => method(:get_robots))
end
# :section: SSL
# Path to an OpenSSL CA certificate file, as set on the underlying
# connection (+@http+).
def ca_file
@http.ca_file
end
# Sets the path to an OpenSSL CA certificate file on the underlying
# connection (+@http+).
def ca_file=(ca_file)
  @http.ca_file = ca_file
end
# The SSL certificate store used for validating connections, as set on the
# underlying connection (+@http+).
def cert_store
@http.cert_store
end
# Sets the SSL certificate store used for validating connections on the
# underlying connection (+@http+).
def cert_store=(cert_store)
  @http.cert_store = cert_store
end
# The client X509 certificate, as set on the underlying connection
# (+@http+).
def certificate
@http.certificate
end
# Sets the client certificate. An OpenSSL::X509::Certificate is used as it
# is; anything else is treated as the path of a file the certificate is
# loaded from.
def certificate=(certificate)
  unless OpenSSL::X509::Certificate === certificate
    certificate = OpenSSL::X509::Certificate.new(File.read(certificate))
  end

  @http.certificate = certificate
end
# The client's private key, as set on the underlying connection (+@http+).
def private_key
@http.private_key
end
# Sets the client's private key. An OpenSSL::PKey::PKey is used as it is;
# anything else is treated as the path of a file holding an RSA key, which
# is read and decrypted with the configured key password (+pass+).
def private_key=(private_key)
  unless OpenSSL::PKey::PKey === private_key
    private_key = OpenSSL::PKey::RSA.new(File.read(private_key), @pass)
  end

  @http.private_key = private_key
end
# SSL version to use, as set on the underlying connection (+@http+). Only
# defined when RUBY_VERSION compares greater than '1.9'.
def ssl_version
@http.ssl_version
end if RUBY_VERSION > '1.9'
# Sets the SSL version to use on the underlying connection (+@http+). Only
# defined when RUBY_VERSION compares greater than '1.9'.
def ssl_version=(ssl_version)
  @http.ssl_version = ssl_version
end if RUBY_VERSION > '1.9'
# A callback for additional certificate verification, as set on the
# underlying connection (+@http+). See
# OpenSSL::SSL::SSLContext#verify_callback
#
# The callback can be used for debugging or to ignore errors by always
# returning +true+. Specifying nil uses the default method that was valid
# when the SSLContext was created
def verify_callback
@http.verify_callback
end
# Sets the certificate verify callback on the underlying connection
# (+@http+).
def verify_callback=(verify_callback)
  @http.verify_callback = verify_callback
end
# How to verify SSL connections, as set on the underlying connection
# (+@http+). Defaults to VERIFY_PEER
def verify_mode
@http.verify_mode
end
# Sets the mode for verifying SSL connections on the underlying connection
# (+@http+).
def verify_mode=(verify_mode)
  @http.verify_mode = verify_mode
end
# :section: Timeouts
# Reset connections that have not been used in this many seconds. Read from
# the underlying connection (+@http+).
def idle_timeout
@http.idle_timeout
end
# Sets the idle timeout for persistent connections on the underlying
# connection (+@http+).
def idle_timeout=(timeout)
  @http.idle_timeout = timeout
end
# :section: Utility

##
# Copies +input_io+ into a new output IO, reading +read_size+ bytes at a
# time, and returns the output rewound. The output starts as a StringIO and
# is moved into a Tempfile called +name+ once it reaches the max_file_buffer
# size.
#
# If a block is provided, each chunk of +input_io+ is yielded and the
# block's result is written in its place.
def auto_io(name, read_size, input_io)
  out_io = StringIO.new
  out_io.set_encoding Encoding::BINARY if out_io.respond_to? :set_encoding

  while !input_io.eof?
    # Move what has been collected so far into a Tempfile when it got large
    if StringIO === out_io && use_tempfile?(out_io.size)
      spill_io = make_tempfile(name)
      spill_io.write(out_io.string)
      out_io = spill_io
    end

    data = input_io.read(read_size)
    data = yield data if block_given?

    out_io.write(data)
  end

  out_io.rewind
  out_io
end
# Inflates the +compressed+ IO in 1024 byte chunks and returns an IO with
# the result. +window_bits+ is passed to Zlib::Inflate.new as it is. The
# Zlib stream is closed before returning.
def inflate(compressed, window_bits = nil)
  zstream = Zlib::Inflate.new(window_bits)

  out_io = auto_io('mechanize-inflate', 1024, compressed) { |chunk|
    zstream.inflate(chunk)
  }

  zstream.finish

  out_io
ensure
  zstream.close
end
# Returns the logger of the context (+@context.log+).
def log
@context.log
end
##
# Sets the proxy address, port, user, and password. +addr+ should be a host,
# with no "http://"; +port+ may be a port number, service name or port
# number string.
#
# The proxy is cleared when +addr+ or +port+ is missing. Raises
# ArgumentError when +port+ is neither a known service name nor a number.
def set_proxy(addr, port, user = nil, pass = nil)
  if !addr || !port
    @http.proxy = nil

    return
  end

  unless Integer === port
    port = begin
             # Try a service name first, then a numeric string
             Socket.getservbyname port
           rescue SocketError
             begin
               Integer port
             rescue ArgumentError
               raise ArgumentError, "invalid value for port: #{port.inspect}"
             end
           end
  end

  proxy = URI "http://#{addr}"
  proxy.port = port
  proxy.user = user if user
  proxy.password = pass if pass

  @http.proxy = proxy
end
# Creates a Tempfile called +name+ that is unlinked right away and switched
# to binary mode, and returns it.
def make_tempfile(name)
  Tempfile.new(name).tap { |tempfile|
    tempfile.unlink
    tempfile.binmode if tempfile.respond_to? :binmode
  }
end
# Tells whether a body of +size+ bytes should go to a Tempfile: true when
# both the max_file_buffer and +size+ are set and +size+ has reached the
# buffer limit, false otherwise.
def use_tempfile?(size)
  return false unless @max_file_buffer && size

  size >= @max_file_buffer
end
end
require 'mechanize/http/auth_store'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment