Skip to content

Instantly share code, notes, and snippets.

@Menziess
Last active March 30, 2024 13:23
Show Gist options
  • Star 5 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save Menziess/bfcbea6a309e0990e8c296ce23125059 to your computer and use it in GitHub Desktop.
Save Menziess/bfcbea6a309e0990e8c296ce23125059 to your computer and use it in GitHub Desktop.
def deep_ls(path: str, max_depth=1, reverse=False, key=None, keep_hidden=False):
    """List all files in base path recursively.

    List all files and folders in specified path and subfolders within
    maximum recursion depth. The listing itself comes from
    `dbutils.fs.ls`; `dbutils` is not defined here and must be available
    as a global when the generator is consumed.

    Parameters
    ----------
    path : str
        The path of the folder from which files are listed
    max_depth : int
        The maximum recursion depth
    reverse : bool
        As used in `sorted([1, 2], reverse=True)`
    key : Callable
        As used in `sorted(['aa', 'aaa'], key=len)`
    keep_hidden : bool
        Keep files and folders starting with '_' or '.'

    Yields
    ------
    The entries returned by `dbutils.fs.ls`, one at a time. For every
    visited folder its files (paths not ending with '/') come first, in
    sorted order. Its subfolders follow: they are expanded recursively
    while `max_depth` is greater than 1, otherwise the folder entries
    themselves are yielded.

    Examples
    --------
    >>> from pprint import pprint
    >>> files = list(deep_ls('/databricks-datasets/asa/airlines'))
    >>> pprint(files) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
    [FileInfo(path='dbfs:/databricks-datasets/asa/airlines/1987.csv', name='1987.csv', size=127162942),
    ...
    FileInfo(path='dbfs:/databricks-datasets/asa/airlines/2008.csv', name='2008.csv', size=689413344)]
    >>> first, *_, last = files
    >>> first
    FileInfo(path='dbfs:/databricks-datasets/asa/airlines/1987.csv', name='1987.csv', size=127162942)
    >>> last
    FileInfo(path='dbfs:/databricks-datasets/asa/airlines/2008.csv', name='2008.csv', size=689413344)
    """
    entries = dbutils.fs.ls(path)

    # Hidden files may be filtered out. `startswith` (rather than indexing
    # the first character) also tolerates an empty name.
    if not keep_hidden:
        entries = [e for e in entries if not e.name.startswith(('_', '.'))]

    # Apply the caller's sorting rules once; both groups below keep this order
    entries = sorted(entries, reverse=reverse, key=key)

    # Folders are recognised by a trailing '/'. Compare by value with
    # `endswith`: an identity test (`is '/'`) only works by accident of
    # string caching and emits a SyntaxWarning on Python 3.8+.
    files = [e for e in entries if not e.path.endswith('/')]
    folders = [e for e in entries if e.path.endswith('/')]

    # Return all files first
    yield from files

    if max_depth > 1:
        # The max_depth has not been reached: descend into each subfolder
        for folder in folders:
            yield from deep_ls(folder.path, max_depth - 1, reverse, key, keep_hidden)
    else:
        # The max_depth has been reached: return the folders themselves
        yield from folders
def key(val):
    """Sort function.

    Takes an object with a `path` attribute holding a filepath:
    '/mnt/raw/store/item/year=2019/month=6/day=4/'
    Extracts the integer 4 (the text after the last '=' of the last
    non-empty path segment) or returns -1 when that text is not an
    integer or the path has no non-empty segment at all.
    """
    # Drop empty segments produced by leading, trailing or doubled slashes
    segments = [segment for segment in val.path.split('/') if segment]
    if not segments:
        # e.g. '' or '/': nothing to parse, use the documented fallback
        # instead of failing on an empty list
        return -1

    # Text after the last '=', or the whole segment when there is no '='
    _, _, candidate = segments[-1].rpartition('=')
    try:
        return int(candidate)
    except ValueError:
        return -1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment