Python program to download a complete directory or a single file from Microsoft Azure ADLS. This program is capable of recursively downloading a complete directory from Azure Data Lake Storage. It uses the Azure Blob Storage API to iterate over the directories and files and to download the data. This was tested as of 05-October-2020. For more details, refer to https…
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
import os | |
from azure.storage.blob import BlobServiceClient | |
class DownloadADLS:
    '''
    Download single blobs or whole "directory" trees from one storage
    container to the local filesystem.

    Directories are treated as '/'-separated name prefixes: everything is
    derived from the blob names returned by the container client's
    list_blobs(name_starts_with=...) call.
    '''

    def __init__(self, connection_string, container_name):
        '''
        Build a container client for container_name from connection_string
        '''
        service_client = BlobServiceClient.from_connection_string(connection_string)
        self.client = service_client.get_container_client(container_name)

    @staticmethod
    def _as_prefix(path):
        '''
        Return path with a trailing '/' so it only matches names below it;
        the empty path (container root) is returned unchanged
        '''
        if path and not path.endswith('/'):
            return path + '/'
        return path

    def download(self, source: str, dest: str) -> None:
        '''
        Download a file or directory to a path on the local filesystem

        If any blob exists below source, source is treated as a directory:
        dest is treated as a directory too and the tree is recreated under
        dest/<last component of source>/. Otherwise source is downloaded as
        a single blob via download_file().

        Raises ValueError (a subclass of Exception) if dest is empty.
        '''
        if not dest:
            raise ValueError('A destination must be provided')

        relative_paths = self.ls_files(source, recursive=True)
        if not relative_paths:
            # nothing is stored below source, so it must name a single blob
            self.download_file(source, dest)
            return

        # if source is a directory, dest must also be a directory
        prefix = self._as_prefix(source)
        if not dest.endswith('/'):
            dest += '/'
        # append the directory name from source to the destination; the
        # container root ('') has no name, so nothing is appended for it
        dir_name = os.path.basename(os.path.normpath(prefix)) if prefix else ''
        if dir_name:
            dest += dir_name + '/'

        for relative_path in relative_paths:
            if relative_path.endswith('/'):
                # A name ending in '/' cannot be opened as a local file;
                # mirror it as an (empty) directory instead.
                os.makedirs(dest + relative_path, exist_ok=True)
                continue
            # The target path is already fully resolved here, so bypass the
            # "is dest a directory?" guessing done by download_file().
            self._fetch(prefix + relative_path, dest + relative_path)

    def download_file(self, source: str, dest: str) -> None:
        '''
        Download a single file to a path on the local filesystem

        dest is a directory if it ends with '/' or '.', in which case the
        blob's base name is appended to it; otherwise dest is the file path.
        '''
        if dest.endswith('.'):
            dest += '/'
        blob_dest = dest + os.path.basename(source) if dest.endswith('/') else dest
        self._fetch(source, blob_dest)

    def _fetch(self, blob_name, local_path):
        '''
        Stream the blob blob_name into the local file local_path
        '''
        print(f'Downloading {blob_name} to {local_path}')
        # Start the download before touching the filesystem, so a failing
        # request does not leave directories or an empty file behind.
        downloader = self.client.get_blob_client(blob=blob_name).download_blob()
        parent = os.path.dirname(local_path)
        # os.makedirs('') raises, so skip it for paths in the current directory
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(local_path, 'wb') as file:
            # write straight into the file instead of buffering the whole
            # blob in memory first
            downloader.readinto(file)

    def ls_files(self, path: str, recursive: bool = False) -> list:
        '''
        List files under a path, optionally recursively

        Returns blob names relative to path, '/'-separated, in the order
        the container client yields them. Without recursive only names
        directly under path (no further '/') are returned.
        '''
        prefix = self._as_prefix(path)
        files = []
        for blob in self.client.list_blobs(name_starts_with=prefix):
            # Blob names always use '/', so strip the prefix textually rather
            # than going through OS-dependent os.path helpers.
            relative_path = blob.name[len(prefix):]
            if not relative_path:
                continue
            if recursive or '/' not in relative_path:
                files.append(relative_path)
        return files

    def ls_dirs(self, path: str, recursive: bool = False) -> list:
        '''
        List directories under a path, optionally recursively

        Returns unique directory paths relative to path, '/'-separated, in
        first-seen order. Every ancestor directory of a blob is reported, so
        a directory that only contains sub-directories is listed as well.
        Without recursive only the first level below path is returned.
        '''
        prefix = self._as_prefix(path)
        dirs = []
        seen = set()  # O(1) duplicate check; dirs keeps the first-seen order
        for blob in self.client.list_blobs(name_starts_with=prefix):
            # everything before the last '/' is the blob's directory part
            components = blob.name[len(prefix):].split('/')[:-1]
            current = ''
            for component in components:
                if not component:
                    continue
                current = f'{current}/{component}' if current else component
                if current not in seen:
                    seen.add(current)
                    dirs.append(current)
                if not recursive:
                    # only the first level is wanted
                    break
        return dirs
if __name__ == '__main__':
    # Script entry point. All values below are empty placeholders and have
    # to be filled in before running: the storage connection string, the
    # container name, the blob path to fetch and the local destination.
    CONNECTION_STRING = ""
    CONTAINER_NAME = ""
    client = DownloadADLS(
        CONNECTION_STRING,
        CONTAINER_NAME,
    )
    client.download(
        source="",
        dest="",
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment