Skip to content

Instantly share code, notes, and snippets.

@jaytaylor
Last active May 21, 2017 17:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jaytaylor/95551de799371eb87fdc048012f4cecb to your computer and use it in GitHub Desktop.
Finds directories under filesystem path which share a leading first token.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import glob, os, sys
def first_occurrence_of(s, delimiters):
    """Return the smallest index in ``s`` at which any of ``delimiters``
    occurs, or -1 if none of them are present.
    """
    hits = [idx for idx in (s.find(d) for d in delimiters) if idx >= 0]
    return min(hits) if hits else -1
def gen_token_item_tuples(items, index_fn, exclude_index_values):
    """Map each item to a ``(prefix, item)`` tuple, where the prefix is
    ``item[:index_fn(item)]``.

    Items whose computed index falls in ``exclude_index_values`` are
    dropped.  Input order is preserved.
    """
    indexed = ((item, index_fn(item)) for item in items)
    return [(item[:idx], item)
            for item, idx in indexed
            if idx not in exclude_index_values]
def find_dupe_prefixes(pathname, delimiters=('.', '_', ' ')):
    """Finds directory entries under pathname which share a leading first token.

    The leading token is everything before the first occurrence of any
    delimiter; entries with no delimiter, or a delimiter at position 0,
    are ignored.  Token comparison is case-insensitive.

    Returns a list of groups, each group a list of two or more entry
    names sharing the same (lowercased) leading token.

    Big-O Analysis
    --------------
    Computational complexity:
        tokenization of entries = n
        timsort                 = n log n
        for-loop                = n
        -> O(n log n)
    Space complexity:
        timsort  = n
        for-loop = 1
        -> O(n)
    """
    entries = os.listdir(pathname)
    token_entry_tuples = gen_token_item_tuples(
        entries, lambda s: first_occurrence_of(s, delimiters), (0, -1))
    # Sort on the LOWERCASED token so that case-variant duplicates
    # (e.g. "Foo.x" / "foo.y") land adjacently.  The previous
    # case-sensitive sort key could separate such pairs, causing the
    # case-insensitive comparison below to miss them.
    token_entry_tuples.sort(
        key=lambda tup: (tup[0].lower(), tup[1][len(tup[0]) + 1:]))
    found_dupes = []
    prev_token = None     # lowercased token of the previous entry
    prev_entry = None     # entry name of the previous tuple
    in_run = False        # True while extending an already-open dupe group
    for curr_token, curr_entry in token_entry_tuples:
        # NOTE: tokens with index 0 / -1 were already excluded above, so
        # no per-item re-check of first_occurrence_of is needed here.
        curr_token_lower = curr_token.lower()
        if curr_token_lower == prev_token:
            if in_run:
                # Third-or-later member of the current group.
                found_dupes[-1].append(curr_entry)
            else:
                # Second member: open a new group with both entries.
                found_dupes.append([prev_entry, curr_entry])
            in_run = True
        else:
            in_run = False
        prev_token = curr_token_lower
        prev_entry = curr_entry
    return found_dupes
if __name__ == '__main__':
    # CLI entry point: validate the single [path] argument, then print
    # each group of duplicate-prefixed entries, one name per line.
    args = sys.argv[1:]
    if not args:
        sys.stderr.write('error: missing required argument: [path]\n')
        sys.exit(1)
    pathname = args[0]
    if not os.path.exists(pathname):
        sys.stderr.write('error: path "%s" does not exist\n' % (pathname,))
        sys.exit(1)
    if not os.path.isdir(pathname):
        sys.stderr.write('error: path "%s" is not a directory\n' % (pathname,))
        sys.exit(1)
    for group in find_dupe_prefixes(pathname):
        print('\n'.join(group))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment