Last active
May 21, 2017 17:17
-
-
Save jaytaylor/95551de799371eb87fdc048012f4cecb to your computer and use it in GitHub Desktop.
Finds directories under filesystem path which share a leading first token.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import glob, os, sys
def first_occurrence_of(s, delimiters):
    """Return the index of the earliest delimiter occurring in s.

    :param s: String to scan.
    :param delimiters: Iterable of single-character delimiter strings.
    :return: Smallest index at which any delimiter appears, or -1 when
        none of the delimiters occur in s.
    """
    positions = [s.find(d) for d in delimiters]
    hits = [p for p in positions if p != -1]
    if not hits:
        return -1
    return min(hits)
def gen_token_item_tuples(items, index_fn, exclude_index_values):
    """Pair each item with its leading token.

    The token is the prefix item[0:index], where index = index_fn(item).
    Items whose index falls in exclude_index_values are dropped.

    :param items: Iterable of strings.
    :param index_fn: Callable mapping an item to a split index.
    :param exclude_index_values: Indexes that disqualify an item (e.g. (0, -1)).
    :return: List of (token, item) tuples, in input order.
    """
    result = []
    for entry in items:
        cut = index_fn(entry)
        if cut in exclude_index_values:
            # No usable token for this entry; skip it.
            continue
        result.append((entry[0:cut], entry))
    return result
def find_dupe_prefixes(pathname, delimiters=('.', '_', ' ')):
    """
    Finds directories under pathname which share a leading first token.

    The first token of an entry is the text before the earliest occurrence
    of any delimiter; entries with no delimiter, or a delimiter at position
    0, are skipped.  Token comparison is case-insensitive.

    Big-O Analysis
    --------------
    Computational complexity:
        tokenization of directories = n
        timsort = n log n
        for-loop = n
        -> O(n log n)
    Space complexity:
        timsort = n
        for-loop = 1
        -> O(n)

    :param pathname: Filesystem path whose entries are scanned.
    :param delimiters: Characters that terminate the leading token.
    :return: List of lists; each inner list contains 2+ entry names that
        share the same (case-insensitively compared) leading token.
    """
    directories = os.listdir(pathname)
    token_directory_tuples = gen_token_item_tuples(
        directories, lambda s: first_occurrence_of(s, delimiters), (0, -1))
    # BUGFIX: the sort key must be case-insensitive.  The dupe comparison
    # below lowercases tokens, so sorting case-sensitively could leave
    # entries like 'Foo.a' and 'foo.b' non-adjacent and their shared token
    # would be missed.
    sorted_token_directory_tuples = sorted(
        token_directory_tuples,
        key=lambda tup: (tup[0].lower(), tup[1].lower()))
    found_dupes = []
    prev_token, prev_prev_token = ('', '')
    prev_directory = ''
    # NB: no need to re-check first_occurrence_of here; entries with index
    # 0 or -1 were already excluded by gen_token_item_tuples above.
    for (curr_token, curr_directory) in sorted_token_directory_tuples:
        curr_token_lower = curr_token.lower()
        if curr_token_lower == prev_token:
            if prev_token == prev_prev_token:
                # Third (or later) member of the current run: extend the
                # most recently opened group.
                found_dupes[-1].append(curr_directory)
            else:
                # Second member of a run: open a new group seeded with the
                # previous entry.
                found_dupes.append([prev_directory, curr_directory])
        prev_prev_token = prev_token
        prev_token = curr_token_lower
        prev_directory = curr_directory
    return found_dupes
if __name__ == '__main__':
    # Command-line entry point: expects a single directory path argument,
    # validates it, then prints each group of duplicate-prefixed directory
    # names (one name per line).
    if len(sys.argv) < 2:
        sys.stderr.write('error: missing required argument: [path]\n')
        sys.exit(1)
    target = sys.argv[1]
    # Validate the argument before doing any real work.
    if not os.path.exists(target):
        sys.stderr.write('error: path "%s" does not exist\n' % (target,))
        sys.exit(1)
    if not os.path.isdir(target):
        sys.stderr.write('error: path "%s" is not a directory\n' % (target,))
        sys.exit(1)
    for group in find_dupe_prefixes(target):
        print('\n'.join(group))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment