Skip to content

Instantly share code, notes, and snippets.

@amalgjose
Last active December 4, 2023 16:15
Show Gist options
  • Star 6 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save amalgjose/7de7b93a326e5a6f53f8b43ba5187932 to your computer and use it in GitHub Desktop.
Save amalgjose/7de7b93a326e5a6f53f8b43ba5187932 to your computer and use it in GitHub Desktop.
Python program to download a complete directory or a single file from Microsoft Azure ADLS. The program is capable of recursively downloading a complete directory from Azure Data Lake Storage. It uses the Azure Blob Storage API to iterate over the directories and files and to download the data. It was tested as of 05-October-2020. For more details, refer to https…
# coding: utf-8
import os
import posixpath

from azure.storage.blob import BlobServiceClient
class DownloadADLS:
    '''
    Download single files or whole directory trees from an Azure storage
    container through the Blob Storage API.

    Remote (blob) paths always use '/' as the separator, independent of the
    local operating system, so they are handled with string operations and
    ``posixpath`` rather than ``os.path``.
    '''

    def __init__(self, connection_string, container_name):
        '''
        Create the container client used for listing and downloading.

        connection_string -- passed to BlobServiceClient.from_connection_string
        container_name    -- name of the container to read from
        '''
        service_client = BlobServiceClient.from_connection_string(connection_string)
        self.client = service_client.get_container_client(container_name)

    @staticmethod
    def _as_prefix(path):
        '''
        Return path with a trailing '/', leaving '' (the container root) as is
        '''
        path = path or ''
        if path and not path.endswith('/'):
            path += '/'
        return path

    def download(self, source, dest):
        '''
        Download a file or directory to a path on the local filesystem

        If any blob is listed beneath ``source`` it is treated as a directory:
        its last path component is appended to ``dest`` and every blob below
        it is written there, keeping the relative layout. Otherwise ``source``
        is downloaded as a single file via ``download_file``.

        Raises ValueError when ``dest`` is empty.
        '''
        if not dest:
            # ValueError is still caught by callers using `except Exception`
            raise ValueError('A destination must be provided')

        relative_files = self.ls_files(source, recursive=True)
        if not relative_files:
            # nothing is listed below source, so treat it as a single blob
            self.download_file(source, dest)
            return

        # if source is a directory, dest must also be a directory
        source = self._as_prefix(source)
        if not dest.endswith('/'):
            dest += '/'
        # append the directory name from source to the destination; the
        # container root ('') has no name, so nothing is appended for it
        dir_name = posixpath.basename(posixpath.normpath(source)) if source else ''
        if dir_name and dir_name != '.':
            dest += dir_name + '/'

        for relative_path in relative_files:
            self.download_file(source + relative_path, dest + relative_path)

    def download_file(self, source, dest):
        '''
        Download a single file to a path on the local filesystem

        ``dest`` is a directory if it ends with '/' or '.', in which case the
        blob's own file name is appended; otherwise it is the target file.
        Missing parent directories are created.
        '''
        # dest is a directory if ending with '/' or '.', otherwise it's a file
        if dest.endswith('.'):
            dest += '/'
        blob_dest = dest + posixpath.basename(source) if dest.endswith('/') else dest
        print(f'Downloading {source} to {blob_dest}')

        # a bare file name has no parent directory; os.makedirs('') would raise
        parent_dir = os.path.dirname(blob_dest)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        # start the download before creating the local file so that a failing
        # request does not leave an empty file behind
        downloader = self.client.get_blob_client(blob=source).download_blob()
        with open(blob_dest, 'wb') as file:
            # write straight into the file instead of buffering the whole blob
            downloader.readinto(file)

    def ls_files(self, path, recursive=False):
        '''
        List files under a path, optionally recursively

        Returns paths relative to ``path``, in listing order. Without
        ``recursive`` only files directly inside ``path`` are returned.
        '''
        prefix = self._as_prefix(path)
        files = []
        for blob in self.client.list_blobs(name_starts_with=prefix):
            # every listed name starts with prefix, so slicing yields the
            # relative path without involving the local cwd or OS separator
            relative_path = blob.name[len(prefix):]
            if not relative_path or relative_path.endswith('/'):
                # an entry for the prefix itself or a name ending in '/'
                # cannot be written as a local file
                continue
            if recursive or '/' not in relative_path:
                files.append(relative_path)
        return files

    def ls_dirs(self, path, recursive=False):
        '''
        List directories under a path, optionally recursively

        Returns directory paths relative to ``path``, each listed once, in
        order of first appearance. Without ``recursive`` only the immediate
        child directories are returned; with it, every nested level is
        included, also levels that hold no files of their own.
        '''
        prefix = self._as_prefix(path)
        dirs = []
        seen = set()  # O(1) duplicate check; `dirs` keeps the order
        for blob in self.client.list_blobs(name_starts_with=prefix):
            relative_dir = posixpath.dirname(blob.name[len(prefix):])
            parts = [part for part in relative_dir.split('/') if part]
            if not recursive:
                # only the first component is a direct child of path
                parts = parts[:1]
            for depth in range(1, len(parts) + 1):
                candidate = '/'.join(parts[:depth])
                if candidate not in seen:
                    seen.add(candidate)
                    dirs.append(candidate)
        return dirs
if __name__ == '__main__':
    # Example driver: fill in the connection string, container name and the
    # source/destination paths below before running this file as a script.
    CONNECTION_STRING = ""
    CONTAINER_NAME = ""

    client = DownloadADLS(
        connection_string=CONNECTION_STRING,
        container_name=CONTAINER_NAME,
    )
    # source is the remote file or directory, dest the local target path
    client.download(source="", dest="")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment