Skip to content

Instantly share code, notes, and snippets.

@alexwlchan
Last active August 13, 2016 19:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save alexwlchan/44f6b99d7b9c4d2516070130805cd3b4 to your computer and use it in GitHub Desktop.
Save alexwlchan/44f6b99d7b9c4d2516070130805cd3b4 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
"""
download_files.py
~~~~~~~~~~~~~~~~~
This is a file downloader for Python with a few special features:
* If the user doesn't supply a filename, try to infer a filename and
file extension from the URL.
* Don't overwrite an existing file of the same name.
This requires Django, Werkzeug and Requests. Written for Python 3.
"""
import mimetypes
import os
import urllib.parse
import urllib.request

from django.core.files.storage import Storage
from django.core.validators import URLValidator
import requests
from werkzeug.utils import secure_filename
class OSStorage(Storage):
    """
    A Django ``Storage`` subclass whose existence check is answered by
    the local filesystem.
    """

    def exists(self, name):
        """
        Report whether ``os.path.exists`` finds ``name`` on the local
        filesystem.
        """
        is_present = os.path.exists(name)
        return is_present
def _get_filename_from_url(url):
    """
    Given a URL to download, guess a sensible filename.

    The name is the last component of the URL's path, passed through
    ``secure_filename``.  If that name has no file extension, a HEAD
    request is made and an extension is guessed from the Content-Type
    header of the response.  If no extension can be guessed, the name
    is returned without one.

    :param url: The URL that is going to be downloaded.
    :returns: A filename (not a full path) to use for the saved file.
    """
    # Get an initial filename to use for the saved file, which is safe
    # from malicious user input or weird URLs.
    filename = os.path.basename(urllib.parse.urlparse(url).path)
    filename = secure_filename(filename)

    # If the filename already has an extension, there is nothing to guess.
    _, ext = os.path.splitext(filename)
    if ext:
        return filename

    # Otherwise make a HEAD request, and guess a file extension based on
    # the Content-Type header of the response.  Requests does not follow
    # redirects for HEAD unless asked to, and we want the headers of the
    # resource that will really be downloaded, not of the redirect.
    resp = requests.head(url, allow_redirects=True)
    content_type = resp.headers.get('Content-Type') or ''

    # Drop any parameters (e.g. "text/html; charset=utf-8"); only the bare
    # media type is useful for picking an extension.
    mime_type = content_type.split(';', 1)[0].strip().lower()

    if mime_type == 'image/jpeg':
        extension = '.jpeg'
    elif mime_type:
        # guess_extension() returns None for types it does not know.
        extension = mimetypes.guess_extension(mime_type)
    else:
        extension = None

    # Only append when we actually have an extension; a missing header or
    # an unknown type leaves the filename as it was.
    if extension:
        filename += extension
    return filename
def download_from_url(url, *, filename=None, download_dir=None):
    """
    Fetch ``url`` and save it to disk, returning the path it was saved to.

    When ``filename`` is None, a name is chosen by
    ``_get_filename_from_url``.  When ``download_dir`` is given, the
    directory is created if necessary and the file is saved inside it.
    The final name comes from ``OSStorage().get_available_name``, so the
    returned path may differ from the requested one.
    """
    target = _get_filename_from_url(url) if filename is None else filename

    # Saving somewhere other than the current directory: make sure the
    # directory exists before building the full path.
    if download_dir is not None:
        os.makedirs(download_dir, exist_ok=True)
        target = os.path.join(download_dir, target)

    # Let the storage helper pick a name that is free on the local system.
    target = OSStorage().get_available_name(name=target)

    # With the destination settled, fetch the file.
    urllib.request.urlretrieve(url, target)
    return target
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment