def deep_ls(path: str, max_depth=1):
    """
    List all files and folders in the specified path and
    its subfolders, down to the maximum recursion depth.
    """
    # List all files and folders in the current path
    li = mssparkutils.fs.ls(path)

    # Yield all files (size != 0 distinguishes files from folders)
    for x in li:
        if x.size != 0:
            yield x

    # If the max_depth has not been reached, start
    # listing files and folders in subdirectories
    if max_depth > 1:
        for x in li:
            if x.size != 0:
                continue
            for y in deep_ls(x.path, max_depth - 1):
                yield y

    # If max_depth has been reached,
    # yield the folders themselves
    else:
        for x in li:
            if x.size == 0:
                yield x
def convertfiles2df(files):
    """
    Converts a list of FileInfo objects into a Pandas DataFrame to enable display.
    """
    # Disable Arrow-based transfers since the Pandas DataFrame is tiny
    spark.conf.set("spark.sql.execution.arrow.enabled", "false")

    schema = ['path', 'name', 'size']
    df = pd.DataFrame([[getattr(i, j) for j in schema] for i in files], columns=schema).sort_values('path')
    return df
# Example Implementation
# ----------------------
import pandas as pd
from notebookutils import mssparkutils

# Azure storage access info
adls_account_name = 'your-account-name'
adls_container_name = 'your-container-name'
linked_service_name = 'adls-linked-service-name-in-synapse'

# Grab SAS token
adls_sas_token = mssparkutils.credentials.getConnectionStringOrCreds(linked_service_name)

# Configure Spark to access the DFS endpoint
root = 'abfss://%s@%s.dfs.core.windows.net/' % (adls_container_name, adls_account_name)
spark.conf.set('fs.azure.sas.%s.%s.dfs.core.windows.net' % (adls_container_name, adls_account_name), adls_sas_token)

# Get files
files = list(deep_ls(root, max_depth=20))

# Display with Pretty Printing
display(convertfiles2df(files))

# Pretty Printing works with default ls as well
display(convertfiles2df(mssparkutils.fs.ls(root)))
This is brilliant! Something I've been struggling with.
+1
Glad it helped!
This is perfect. Thank you!
Thank you! This helped.
Are the 'if x.size == 0' lines solely for differentiating between files and folders? If so, each FileInfo object in the list returned by mssparkutils.fs.ls() has a bunch of "hidden" attributes (i.e. not revealed by print(FileInfo)), including 'isDir' (boolean) which indicates whether or not the item is a directory. If it's possible to have files with sizes of zero, then this would be more reliable. Discovered that here.
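For illustration, here is a minimal sketch of deep_ls rewritten around that attribute, assuming isDir behaves as described above (a hypothetical variant, not the original author's code):

def deep_ls_isdir(path: str, max_depth=1):
    """
    Variant of deep_ls that uses FileInfo.isDir to tell files from folders,
    so zero-byte files are still yielded as files.
    """
    for x in mssparkutils.fs.ls(path):
        if not x.isDir:
            # Always yield files, even when their size is zero
            yield x
        elif max_depth > 1:
            # Recurse into subdirectories until the depth limit is reached
            yield from deep_ls_isdir(x.path, max_depth - 1)
        else:
            # At the depth limit, yield the folder itself
            yield x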
@mdrakiburrahman I noticed in the article where you linked this code that it is not a good idea to run recursion on a production Data Lake with a large number of small files. Do you have a recommendation for an alternative approach? I am trying to accomplish what your code does, but at a larger scale. Any help is greatly appreciated!
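Not an answer from the author, but one common way to avoid Python recursion here is an explicit work queue (breadth-first traversal). A minimal sketch, assuming the same size-based file/folder check as the gist; note that it still issues one ls call per folder, so it does not reduce the number of storage API calls:

from collections import deque

def deep_ls_iterative(path: str, max_depth=20):
    """
    Breadth-first listing with an explicit queue instead of recursion.
    Avoids deep call stacks, though it still lists every folder it visits.
    """
    queue = deque([(path, 0)])
    while queue:
        current_path, depth = queue.popleft()
        for x in mssparkutils.fs.ls(current_path):
            if x.size != 0:
                # File: yield it
                yield x
            elif depth + 1 < max_depth:
                # Folder below the depth limit: queue it for listing
                queue.append((x.path, depth + 1))
            else:
                # Folder at the depth limit: yield it as-is
                yield x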