Skip to content

Instantly share code, notes, and snippets.

@thraxil
Created November 17, 2014 22:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thraxil/785531ebff50c68180a4 to your computer and use it in GitHub Desktop.
Save thraxil/785531ebff50c68180a4 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import os
import sys
class TrieNode(object):
def __init__(self):
self.children = dict()
self.count = 0
class Trie(object):
def __init__(self):
self._root = TrieNode()
def insert(self, s):
if len(s) < 1:
return
c = s[0]
rest = s[1:]
self._insert(self._root, s, c, rest)
def _insert(self, n, s, c, rest):
if not c:
return
if c not in n.children:
n.children[c] = TrieNode()
x = n.children[c]
n.count += 1
if rest:
c2 = rest[0]
rest = rest[1:]
self._insert(x, s, c2, rest)
def find_deepest_n(self, c):
""" find the deepest node in the tree with a count >= c """
matches = self._dfs(self._root, c, "", [])
matches.sort(key=lambda x: len(x))
return matches
def _dfs(self, n, c, prefix, matches):
if n.count < c:
return matches
matches.append(prefix)
for k, v in n.children.items():
matches = self._dfs(v, c, prefix + k, matches)
return matches
if __name__ == "__main__":
t = Trie()
for f in os.listdir("."):
t.insert(f)
c = sys.argv[1]
prefixes = t.find_deepest_n(int(c))
for prefix in prefixes:
print prefix
@thraxil
Copy link
Author

thraxil commented Nov 17, 2014

You've got a giant directory full of files that someone has dumped in there. Many of them are related to each other and come in little series. Looking at it visually, it's easy to see patterns like 'file001.jpg ... file044.jpg' of common prefixes. This is a tool for helping you automatically identify those common prefixes.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment