Last active
August 13, 2016 19:03
-
-
Save alexwlchan/44f6b99d7b9c4d2516070130805cd3b4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
"""
download_files.py
~~~~~~~~~~~~~~~~~

This is a file downloader for Python with a few special features:

 * If the user doesn't supply a filename, try to infer a filename and
   file extension from the URL.
 * Don't overwrite an existing file of the same name.

This requires Django, Werkzeug and Requests. Written for Python 3.
"""
import mimetypes
import os
import urllib.parse
import urllib.request

import requests
from django.core.files.storage import Storage
from django.core.validators import URLValidator
from werkzeug.utils import secure_filename
class OSStorage(Storage):
    """
    A Django ``Storage`` subclass whose existence check looks directly at
    the local filesystem.

    Only ``exists`` is overridden; everything else is inherited from
    ``Storage``.
    """

    def exists(self, name):
        """
        Return True if ``name`` refers to an existing path on disk.
        """
        is_present = os.path.exists(name)
        return is_present
def _get_filename_from_url(url):
    """
    Given a URL to download, guess a sensible filename.

    The name is taken from the last component of the URL path and passed
    through ``secure_filename``.  If the result has no extension, a HEAD
    request is made and an extension is guessed from the Content-Type
    header of the response.  When no extension can be determined, the
    name is returned without one rather than raising.

    :param url: The URL that is going to be downloaded.
    :returns: A filename (without any directory component).
    """
    # Get an initial filename to use for the saved file, which is safe
    # from malicious user input or weird URLs.
    filename = os.path.basename(urllib.parse.urlparse(url).path)
    filename = secure_filename(filename)

    # A URL whose path ends in "/" (or whose last component is entirely
    # stripped by sanitising) leaves us with an empty name; fall back to
    # a generic one so the caller never gets "" or a bare ".ext" dotfile.
    if not filename:
        filename = 'download'

    # If the filename already has an extension there is nothing to guess.
    _, ext = os.path.splitext(filename)
    if ext:
        return filename

    # Make a HEAD request, and guess a file extension based on the
    # Content-Type header of the response.  Redirects are followed so
    # that we see the headers of the resource that will actually be
    # downloaded, not those of an intermediate redirect response.
    response = requests.head(url, allow_redirects=True)
    content_type = response.headers.get('Content-Type') or ''

    # Drop any parameters (e.g. "; charset=utf-8") and normalise case,
    # leaving only the bare "type/subtype" value.
    mime_type = content_type.split(';', 1)[0].strip().lower()

    if mime_type == 'image/jpeg':
        # Special-cased so JPEGs always get ".jpeg"; presumably because
        # mimetypes may pick a less familiar extension for this type on
        # some Python versions -- confirm before removing.
        extension = '.jpeg'
    elif mime_type:
        # May be None if the type is unknown to the mimetypes module.
        extension = mimetypes.guess_extension(mime_type)
    else:
        extension = None

    if extension:
        filename += extension

    return filename
def download_from_url(url, *, filename=None, download_dir=None):
    """
    Fetch ``url`` and save it to a local file.

    :param url: The URL to download.
    :param filename: Name to save the file under.  When None, a name is
        inferred from the URL by ``_get_filename_from_url``.
    :param download_dir: Directory to save into.  It is created if it
        does not exist.  When None, ``filename`` is used as given.
    :returns: The path the file was written to, as returned by
        ``OSStorage.get_available_name`` (which may differ from the
        requested name).
    """
    target = _get_filename_from_url(url) if filename is None else filename

    # Saving somewhere other than the current directory: make sure the
    # directory is there, then build the full path inside it.
    if download_dir is not None:
        os.makedirs(download_dir, exist_ok=True)
        target = os.path.join(download_dir, target)

    # Ask the storage helper for a name that is unique on the local
    # system, so an existing file of the same name is not overwritten.
    target = OSStorage().get_available_name(name=target)

    # Everything is in place; perform the actual download.
    urllib.request.urlretrieve(url, target)
    return target
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment