Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Recursively listing Data Lake files with `display` implemented
def _is_directory(entry):
    """
    Return True when a listing entry represents a folder.

    The entry's ``isDir`` flag is used when it exists. Only when the
    entry has no such attribute does this fall back to the older
    "size is zero" heuristic, which cannot tell an empty file from a folder.
    """
    flag = getattr(entry, "isDir", None)
    if flag is None:
        return entry.size == 0
    # NOTE(review): isDir is expected to be a plain attribute on the
    # mssparkutils FileInfo; the callable case is tolerated defensively.
    return bool(flag() if callable(flag) else flag)


def deep_ls(path: str, max_depth=1):
    """
    List all files and folders in specified path and
    subfolders within maximum recursion depth.

    Files found directly in ``path`` are yielded first. Folders are then
    either descended into (while ``max_depth`` is greater than 1) or
    yielded themselves (once the depth limit has been reached).

    Args:
        path: Location handed to ``mssparkutils.fs.ls``.
        max_depth: Number of levels to list. With a value of 1 or less
            only ``path`` itself is listed and its folders are returned
            as entries instead of being expanded.

    Yields:
        The entries returned by ``mssparkutils.fs.ls``, in listing order
        within each group (files first, then folder contents or folders).
    """
    # Split the listing once so zero-byte files are reported as files
    # rather than being mistaken for folders and listed again as paths.
    files = []
    folders = []
    for entry in mssparkutils.fs.ls(path):
        if _is_directory(entry):
            folders.append(entry)
        else:
            files.append(entry)

    # Return all files
    yield from files

    if max_depth > 1:
        # Depth budget remains: expand each folder one level deeper
        for folder in folders:
            yield from deep_ls(folder.path, max_depth - 1)
    else:
        # Depth limit reached: report the folders themselves
        yield from folders
def convertfiles2df(files):
    """
    Build a Pandas DataFrame from an iterable of FileInfo-like entries
    so the listing can be displayed.

    Each entry contributes one row holding its ``path``, ``name`` and
    ``size`` attributes; the rows are returned sorted by ``path``.

    Note: this also sets "spark.sql.execution.arrow.enabled" to "false"
    on the global ``spark`` object every time it is called.
    """
    # Arrow-based transfers are switched off because the resulting
    # Pandas DataFrame is tiny
    spark.conf.set("spark.sql.execution.arrow.enabled", "false")

    columns = ['path', 'name', 'size']
    rows = []
    for info in files:
        rows.append([getattr(info, column) for column in columns])

    frame = pd.DataFrame(rows, columns=columns)
    return frame.sort_values('path')
# Example Implementation
# ----------------------
import pandas as pd
from notebookutils import mssparkutils
# Placeholders: storage account, container and the Synapse linked service
adls_account_name = 'your-account-name'
adls_container_name = 'your-container-name'
linked_service_name = 'adls-linked-service-name-in-synapse'

# Root of the container, addressed through the DFS endpoint
root = 'abfss://%s@%s.dfs.core.windows.net/' % (adls_container_name, adls_account_name)

# Fetch the credential through the linked service and register it with
# Spark under the key built from the container and account names
adls_sas_token = mssparkutils.credentials.getConnectionStringOrCreds(linked_service_name)
spark.conf.set(
    'fs.azure.sas.%s.%s.dfs.core.windows.net' % (adls_container_name, adls_account_name),
    adls_sas_token,
)

# Walk the container, descending at most 20 levels
files = list(deep_ls(root, max_depth=20))

# Render the recursive listing as a table
display(convertfiles2df(files))

# The same rendering works for a plain, non-recursive ls
display(convertfiles2df(mssparkutils.fs.ls(root)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment