Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
A safe way in Ruby to download a file to disk using open-uri (with/without comments)
require "open-uri"
require "net/http"
# Unified error class raised by #download for every expected failure.
Error = Class.new(StandardError)

# Exception classes that #download rescues and re-raises as Error.
# Frozen so the rescue list cannot be mutated by accident at runtime.
DOWNLOAD_ERRORS = [
  SocketError,
  OpenURI::HTTPError,
  RuntimeError,
  URI::InvalidURIError,
  Error,
].freeze
# Downloads +url+ to disk using open-uri and returns the resulting file.
#
# @param url [String] the URL to fetch; may be given encoded or unencoded
# @param max_size [Integer, nil] largest accepted "Content-Length" value;
#   nil disables the size check
# @return [Tempfile] the downloaded file, carrying the OpenURI::Meta methods
# @raise [Error] when the URL is invalid, the file is too big, or any of the
#   DOWNLOAD_ERRORS occurs (RuntimeErrors unrelated to redirection are
#   re-raised unchanged)
def download(url, max_size: nil)
  # URI.encode/URI.decode were removed in Ruby 3.0; the RFC 2396 parser
  # offers the same escape/unescape behaviour on old and new Rubies.
  parser = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER
  # Decode first so an already-encoded URL is not encoded twice. #to_s lets a
  # nil URL fall through to the "url was invalid" check below.
  url = parser.escape(parser.unescape(url.to_s))
  url = URI(url)
  raise Error, "url was invalid" unless url.respond_to?(:open)

  options = {}
  options["User-Agent"] = "MyApp/1.2.3"
  options[:content_length_proc] = ->(size) {
    if max_size && size && size > max_size
      raise Error, "file is too big (max is #{max_size})"
    end
  }

  downloaded_file = url.open(options)

  if downloaded_file.is_a?(StringIO)
    # open-uri only loads Tempfile lazily (for large bodies), so it may not be
    # defined yet when we get a StringIO back.
    require "tempfile"
    stringio = downloaded_file
    tempfile = Tempfile.new("open-uri", binmode: true)
    IO.copy_stream(stringio, tempfile)
    # Leave the file positioned at the start, ready for reading.
    tempfile.rewind
    # Copy the open-uri metadata (e.g. #content_type) from the StringIO.
    OpenURI::Meta.init tempfile, stringio
    downloaded_file = tempfile
  end

  downloaded_file
rescue *DOWNLOAD_ERRORS => error
  # Only redirection-related RuntimeErrors are treated as download failures.
  raise if error.instance_of?(RuntimeError) && error.message !~ /redirection/
  raise Error, "download failed (#{url}): #{error.message}"
end
require "open-uri"
require "net/http" # Just to get the SocketError class
# Unified error class raised by #download for every expected failure.
Error = Class.new(StandardError)

# Exception classes that #download rescues and re-raises as Error.
# Frozen so the rescue list cannot be mutated by accident at runtime.
DOWNLOAD_ERRORS = [
  SocketError,          # domain not found
  OpenURI::HTTPError,   # response status 4xx or 5xx
  RuntimeError,         # redirection errors (e.g. redirection loop)
  URI::InvalidURIError, # invalid URL
  Error,                # our errors
].freeze
# Downloads +url+ to disk using open-uri and returns the resulting file.
#
# @param url [String] the URL to fetch; may be given encoded or unencoded
# @param max_size [Integer, nil] largest accepted "Content-Length" value;
#   nil disables the size check
# @return [Tempfile] the downloaded file, carrying the OpenURI::Meta methods
# @raise [Error] when the URL is invalid, the file is too big, or any of the
#   DOWNLOAD_ERRORS occurs (RuntimeErrors unrelated to redirection are
#   re-raised unchanged)
def download(url, max_size: nil)
  # URI.encode/URI.decode were removed in Ruby 3.0; the RFC 2396 parser
  # offers the same escape/unescape behaviour on old and new Rubies.
  parser = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER

  # URLs with spaces will raise an InvalidURIError, so we need to encode it.
  # However, the user can pass an already encoded URL, so we first need to
  # decode it. #to_s lets a nil URL fall through to the validity check below.
  url = parser.escape(parser.unescape(url.to_s))

  # This will raise an InvalidURIError if the URL is very wrong. It will still
  # pass for strings like "foo", though.
  url = URI(url)

  # We need to check if the URL was either http://, https:// or ftp://, because
  # these are the only ones we can download from. open-uri will add the #open
  # method only to these ones, so this is a good check.
  raise Error, "url was invalid" unless url.respond_to?(:open)

  options = {}

  # It was shown that in a random sample approximately 20% of websites will
  # simply refuse a request which doesn't have a valid User-Agent.
  options["User-Agent"] = "MyApp/1.2.3"

  # It's good to shield ourselves from files that are too big. open-uri will
  # call this block as soon as it gets the "Content-Length" header, which means
  # that we can bail out before we download the file.
  options[:content_length_proc] = ->(size) {
    if max_size && size && size > max_size # sometimes "Content-Length" can be empty
      raise Error, "file is too big (max is #{max_size})"
    end
  }

  # Finally we download the file. Here we mustn't use simple #open that open-uri
  # overrides, because this is vulnerable to shell execution attack (if #open
  # method detects a starting pipe (e.g. "| ls"), it will execute the following
  # as a shell command).
  downloaded_file = url.open(options)

  # open-uri will return a StringIO instead of a Tempfile if the filesize
  # is less than 10 KB, so we patch this behaviour by converting it into a
  # Tempfile.
  if downloaded_file.is_a?(StringIO)
    # open-uri only loads Tempfile lazily (for large bodies), so it may not be
    # defined yet when we get a StringIO back.
    require "tempfile"
    stringio = downloaded_file
    # We need to open it in binary mode for Windows users.
    tempfile = Tempfile.new("open-uri", binmode: true)
    # IO.copy_stream is the most efficient way of data transfer.
    IO.copy_stream(stringio, tempfile)
    # Leave the file positioned at the start, ready for reading.
    tempfile.rewind
    # We add the metadata that open-uri puts on the file (e.g. #content_type)
    OpenURI::Meta.init tempfile, stringio
    downloaded_file = tempfile
  end

  downloaded_file # Finally
rescue *DOWNLOAD_ERRORS => error
  # open-uri will throw a RuntimeError when it detects a redirection loop, so
  # we want to reraise the exception if it was some other RuntimeError
  raise if error.instance_of?(RuntimeError) && error.message !~ /redirection/
  # We raise our unified Error class
  raise Error, "download failed (#{url}): #{error.message}"
end
@alem0lars-yr

This comment has been minimized.

Copy link

alem0lars-yr commented Feb 3, 2017

Thanks!

@ieglonew01f

This comment has been minimized.

Copy link

ieglonew01f commented Apr 19, 2017

`basename` is undefined, and so is `stringio`! Any help?

@paneq

This comment has been minimized.

Copy link

paneq commented Jul 5, 2018

I can see that url.open downloads the whole file to memory first.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.