Skip to content

Instantly share code, notes, and snippets.

@fijiaaron
Last active May 4, 2022 14:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save fijiaaron/9cc5bf4c822a169144473f2076becc17 to your computer and use it in GitHub Desktop.
Save fijiaaron/9cc5bf4c822a169144473f2076becc17 to your computer and use it in GitHub Desktop.
## check_for_distinct_paths.py
"""
Given a list of paths, return a list of distinct paths:
If path1 is the same as path2, they are considered duplicates.
For example:
>>> get_distinct_paths(['abc/def', 'abc/def'])
['abc/def']
If path1 is a prefix of path2, then path1 is considered a duplicate of path2.
For example:
>>> get_distinct_paths(['abc', 'abc/def'])
['abc/def']
>>> get_distinct_paths(['abc', 'abc/def', 'abc/def/ghi', 'abc/def/xyz'])
['abc/def/ghi', 'abc/def/xyz']
Search all paths and eliminate all that are prefixes of others and return the list of distinct paths.
For example:
>>> get_distinct_paths(['abc', 'abc/def/ghi', 'abc/def/ghi/jkl', 'abc/def/ghijkl/jkl', 'def'])
['abc/def/ghi/jkl', 'abc/def/ghijkl/jkl', 'def']
We are essentially finding potential files, but eliminating potential directories that contain those files.
"""
def get_distinct_paths(list_of_paths):
## split paths into parts so we can compare lists
split_paths = [list(path.split("/")) for path in list_of_paths]
# print('split_paths', split_paths)
## sort paths so that we can more easily check shortest to longest
## (this is not needed)
# sorted_paths = sorted(split_paths, key=len)
# print('sorted_paths', sorted_paths)
## just assign back to "paths" for simpler reading (even though what we have now is a sorted list of lists)
paths = split_paths
# paths = sorted_paths
## keep track of distict paths and those that are just partials (i.e. directories, not files)
distincts = []
partials = []
## check each path to see if it is part of any of the other paths
for i, path in enumerate(paths):
distinct = True
# print('path', i, path)
## compare it against each remaining path here
## (one thing that tripped me up was that `split_paths.pop(0)` will alter the original list, because it's a reference)
for j, rpath in enumerate(paths[i+1:]):
# print(f'\t checking if path {path} is in {rpath}')
## only need to see if the list of elements in the path we are checking matches the first len() elements in the other paths
if path == rpath[:len(path)]:
## if it does, we know it's not distinct and can move on
distinct = False
# print('\t\t partial found', path)
## convert path back into a string
path = '/'.join(path)
## add to list of partial paths
partials.append(path)
break
if distinct:
# print('\t\t distinct found', path)
## convert path back into a string
path = '/'.join(path)
## add to list of distinct paths
distincts.append(path)
return distincts
## run doctest by default when invoked directly
## (typically you would import like so:
## `from check_for_distinct_paths import get_distinct_paths`
if __name__ == '__main__':
import doctest
doctest.testmod()
## run a test and verify expected result
paths = ['abc', 'abc/def/ghi', 'abc/def/ghi/jkl', 'abc/def/ghijkl/jkl', 'def']
expected = [ 'abc/def/ghi/jkl', 'abc/def/ghijkl/jkl', 'def' ]
distinct_paths = get_distinct_paths(paths)
print('distinct_paths', distinct_paths)
## convert list to a set for easier comparison
assert set(distinct_paths) == set(expected)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment