def deep_ls(path: str, max_depth=1):
    """
    List all files and folders in the specified path and
    its subfolders, down to the maximum recursion depth.
    """
    # List all files and folders in the current path
    li = mssparkutils.fs.ls(path)

    # Yield all files (size != 0 distinguishes files from folders)
    for x in li:
        if x.size != 0:
            yield x

    # If the max_depth has not been reached, start
    # listing files and folders in subdirectories
    if max_depth > 1:
        for x in li:
            if x.size != 0:
                continue
            for y in deep_ls(x.path, max_depth - 1):
                yield y

    # If max_depth has been reached,
    # yield the folders themselves
    else:
        for x in li:
            if x.size == 0:
                yield x
def convertfiles2df(files):
    """
    Converts a list of FileInfo objects into a Pandas DataFrame to enable display.
    """
    # Disable Arrow-based transfers since the Pandas DataFrame is tiny
    spark.conf.set("spark.sql.execution.arrow.enabled", "false")

    schema = ['path', 'name', 'size']
    df = pd.DataFrame([[getattr(i, j) for j in schema] for i in files], columns=schema).sort_values('path')
    return df
# Example Implementation
# ----------------------
import pandas as pd
from notebookutils import mssparkutils

# Azure storage access info
adls_account_name = 'your-account-name'
adls_container_name = 'your-container-name'
linked_service_name = 'adls-linked-service-name-in-synapse'

# Grab SAS token
adls_sas_token = mssparkutils.credentials.getConnectionStringOrCreds(linked_service_name)

# Configure Spark to access the DFS endpoint
root = 'abfss://%s@%s.dfs.core.windows.net/' % (adls_container_name, adls_account_name)
spark.conf.set('fs.azure.sas.%s.%s.dfs.core.windows.net' % (adls_container_name, adls_account_name), adls_sas_token)

# Get files
files = list(deep_ls(root, max_depth=20))

# Display with Pretty Printing
display(convertfiles2df(files))

# Pretty Printing works with default ls as well
display(convertfiles2df(mssparkutils.fs.ls(root)))
This is brilliant! Something I've been struggling with.
+1
Glad it helped!
This is perfect. Thank you!
Thank you! This helped.
Are the 'if x.size == 0' lines solely for differentiating between files and folders? If so, each FileInfo object in the list returned by mssparkutils.fs.ls() has a bunch of "hidden" attributes (i.e. not revealed by print(FileInfo)), including 'isDir' (boolean) which indicates whether or not the item is a directory. If it's possible to have files with sizes of zero, then this would be more reliable. Discovered that here.
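For illustration, here is a minimal sketch of deep_ls rewritten around that attribute, assuming isDir behaves as described above (a hypothetical variant, not the original author's code):

def deep_ls_isdir(path: str, max_depth=1):
    """
    Variant of deep_ls that uses FileInfo.isDir to tell files from folders,
    so zero-byte files are still yielded as files.
    """
    for x in mssparkutils.fs.ls(path):
        if not x.isDir:
            # Always yield files, even when their size is zero
            yield x
        elif max_depth > 1:
            # Recurse into subdirectories until the depth limit is reached
            yield from deep_ls_isdir(x.path, max_depth - 1)
        else:
            # At the depth limit, yield the folder itself
            yield x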
@mdrakiburrahman I noticed in the article where you linked this code that it is not a good idea to run recursion on a production Data Lake with a large number of small files. Do you have a recommendation for an alternative approach? I am trying to accomplish what your code does, but at a larger scale. Any help is greatly appreciated!
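Not an answer from the author, but one common way to avoid Python recursion here is an explicit work queue (breadth-first traversal). A minimal sketch, assuming the same size-based file/folder check as the gist; note that it still issues one ls call per folder, so it does not reduce the number of storage API calls:

from collections import deque

def deep_ls_iterative(path: str, max_depth=20):
    """
    Breadth-first listing with an explicit queue instead of recursion.
    Avoids deep call stacks, though it still lists every folder it visits.
    """
    queue = deque([(path, 0)])
    while queue:
        current_path, depth = queue.popleft()
        for x in mssparkutils.fs.ls(current_path):
            if x.size != 0:
                # File: yield it
                yield x
            elif depth + 1 < max_depth:
                # Folder below the depth limit: queue it for listing
                queue.append((x.path, depth + 1))
            else:
                # Folder at the depth limit: yield it as-is
                yield x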