Skip to content

Instantly share code, notes, and snippets.

@cg-soft
Last active May 3, 2017 22:41
Show Gist options
  • Save cg-soft/f89cc18984e970d62270a7504834c5f3 to your computer and use it in GitHub Desktop.
Save cg-soft/f89cc18984e970d62270a7504834c5f3 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python2.7
# Filter a set of file paths using wildcarded inclusion and exclusion rules
#
# Usage: See tests below
#
# Restrictions:
# File paths are assumed to be relative paths. Leading slashes are ignored.
# File names may not contain wildcards.
# Supported wildcards:
# "..." matches zero or more complete directory entries. As such, it is
# only valid within slashes, as in .../a/b, a/.../b or such.
# "*" matches zero or more characters within a directory entry.
# "[x]" matches enclosed characters, for example [a-z], [xy], [^x] (not x)...
# No distinction is made between files and directories. If you want a rule to select
# only a directory, you need to end it with "/*".
# Trailing "/" or "/..." in a rule have no effect.
# Inclusions and exclusions are merged, in the sense that whichever rule matches
# the entry closest to the end of the path wins. For example, if you have three rules:
# addInclude('a/b')
# addExclude('a/b/c')
# addInclude('a/b/c/d')
# it means: include everything in a/b, except for things in a/b/c, but do include everything
# in a/b/c/d.
# If multiple rules select the same path location, the last entered rule wins
# The output of getFilteredEntries() is a minimal set of paths listing all included locations.
# A directory in that list means that all subdirectories and files are also included.
import re
# "Bias" are marks which apply to a specific directory, to possibly be overridden by
# anoither mark in a subdirectory.
UNMARKED = 0
INCLUDED = 1
EXCLUDED = 2
# For rendering and debugging bias marks
BIAS = ('(.)', '(+)', '(-)')
# For transforming a shell glob into a regexp
REGEX_QUOTE = re.compile(r'([\\$+{}()?.])')
def splitPath(path, noDuplicate=()):
""" Helper function to remove duplications in paths.
The noDuplicate set contains entries like '...' to be
removed as duplicates (eg '.../...') or at the end of the path.
Entries need to be non-blank, to collapse consecutive slashes.
"""
cleanedPath = []
for entry in path.split('/'):
if entry:
if entry in noDuplicate:
if cleanedPath and cleanedPath[-1] == entry:
continue
cleanedPath.append(entry)
while cleanedPath[-1] in noDuplicate:
cleanedPath.pop()
return cleanedPath
def joinPath(path):
""" Helper function to recreate a proper path.
Since root's name is blank, strip off the first slash.
"""
return "/".join(path).lstrip('/')
class PathFilterException:
def __init__(self, error):
self.error = error
def __str__(self):
return str(self.error)
class PathNode:
def __init__(self, name=''):
""" Helper class to represent a single directory node """
self.name = name
self.mark = UNMARKED
self.seen = False
self.unfilteredKids = 0
self.filteredKids = 0
self.kids = {}
def reset(self):
self.mark = UNMARKED
self.filteredKids = 0
if self.seen:
self.seen = False
for kid in self.kids:
self.kids[kid].reset()
def addEntry(self, path):
""" Add a permanent directory entry """
tail = path[1:]
head = path[0]
if '*' in head or '[' in head or ']' in head or head == '...':
raise PathFilterException('Unsupported character sequence in entry')
if head in self.kids:
if tail:
self.kids[head].addEntry(tail)
else:
raise PathFilterException('Duplicate entry')
else:
self.kids[head] = PathNode(head)
if tail:
self.kids[head].addEntry(tail)
self.unfilteredKids += 1
return self
def addFilter(self, path, bias=INCLUDED):
""" Add a filter condition. This sets the bias marks """
# List of marked items, used both to check whether the rule
# selected something and to keep track so that reset() is cheap.
self.seen = True
# if we are leaf, just set the requested bias
if not path:
self.mark = bias
return 1
# if we are not the leaf, continue processing
marked = 0
tail = path[1:]
head = path[0]
# "..." matches zero or more directories, so....
if head == '...':
# tail must be non-empty, because splitPath() enforces no trailing ...
head = tail.pop(0)
# process path as passed in on all kids, thereby skipping the current dir
for kid in self.kids:
marked += self.kids[kid].addFilter(path, bias)
# Deal with wildcards, if present
if head == '*':
# simplified case...
for kid in self.kids:
marked += self.kids[kid].addFilter(tail, bias)
elif '*' in head or ('[' in head and ']' in head):
try:
headRe = re.compile(REGEX_QUOTE.sub(r'\\\1', head).replace('*','.*'))
except:
raise PathFilterException('Unable to process wildcards')
#print "RE for", head, "is", headRe.pattern
for kid in self.kids:
if headRe.match(kid):
marked += self.kids[kid].addFilter(tail, bias)
# Otherwise check if there is an exact match
elif head in self.kids:
marked += self.kids[head].addFilter(tail, bias)
return marked
def getUnfilteredList(self):
""" Generator returning a dump of the engine state """
if self.seen:
sep = ''
else:
sep = '!'
if self.kids:
for kid in sorted(self.kids.keys()):
for entry in self.kids[kid].getUnfilteredList():
yield [ self.name + BIAS[self.mark] + '[%d/%d]%s' % (self.unfilteredKids, self.filteredKids, sep) ] + entry
else:
yield [ self.name + BIAS[self.mark] + '[%d/%d]%s' % (self.unfilteredKids, self.filteredKids, sep) ]
def setFilteredCount(self, bias):
""" Count number of kids selected by current rule set.
This can be compared with the unfilteredKids to decide whether
to return just this node, or continue recursing into sub-nodes.
"""
self.filteredKids = 0
if self.seen:
if self.mark != UNMARKED:
bias = self.mark
if self.kids:
for kid in self.kids:
self.filteredKids += self.kids[kid].setFilteredCount(bias)
elif bias == INCLUDED:
# this is a leaf node, so if selected it will stand out
self.filteredKids = 1
elif bias == INCLUDED:
# If no rule has touched this node, we either return everything or nothing
if self.kids:
self.filteredKids = self.unfilteredKids
else:
self.filteredKids = 1
return self.filteredKids
def getFilteredList(self):
""" Traverse the tree, dumping the node if either:
- there are kids, but all kids are included
- there are no kids, but the leaf node is selected
"""
if (self.unfilteredKids > 0 and self.filteredKids == self.unfilteredKids) or\
(self.filteredKids > 0 and self.unfilteredKids == 0):
yield [ self.name ]
else:
for kid in sorted(self.kids.keys()):
for entry in self.kids[kid].getFilteredList():
yield [ self.name ] + entry
class PathFilter:
def __init__(self):
""" Filter a set of file paths using wildcarded inclusion and exclusion rules
Usage: See tests below
Restrictions:
File paths are assumed to be relative paths. Leading slashes are ignored.
File names may not contain wildcards.
Supported wildcards:
"..." matches zero or more complete directory entries. As such, it is
only valid within slashes, as in .../a/b, a/.../b or such.
"*" matches zero or more characters within a directory entry.
"[x]" matches enclosed characters, for example [a-z], [xy], [^x] (not x)...
No distinction is made between files and directories. If you want a rule to select
only a directory, you need to end it with "/*".
Trailing "/" or "/..." in a rule have no effect.
Inclusions and exclusions are merged, in the sense that whichever rule matches
the entry closest to the end of the path wins. For example, if you have three rules:
addInclude('a/b')
addExclude('a/b/c')
addInclude('a/b/c/d')
it means: include everything in a/b, except for things in a/b/c, but do include everything
in a/b/c/d.
If multiple rules select the same path location, the last entered rule wins
The output of getFilteredEntries() is a minimal set of paths listing all included locations.
A directory in that list means that all subdirectories and files are also included.
"""
self.root = PathNode()
self.marked = []
def reset(self):
""" Delete all rules and all traces of processing. """
self.root.reset()
return self
def addEntry(self, path):
""" Add a selectable path entry. This must be a unique and complete file path. """
try:
self.root.addEntry(splitPath(path))
except PathFilterException, e:
# re-raise exception, adding the user supplied input for easy reporting
raise PathFilterException("%s: %s" % (str(e), path))
return self
def addEntries(self, paths):
""" Add a list of paths, returning a collected list of exceptions. """
exceptions = []
for path in paths:
try:
self.addEntry(path)
except PathFilterException, e:
exceptions.append(str(e))
return exceptions
def addFilter(self, path, bias=INCLUDED):
""" Add an inclusion or an exclusion rule. """
try:
marked = self.root.addFilter(splitPath(path, ('...',)), bias)
except PathFilterException, e:
# re-raise exception, adding the user supplied input for easy reporting
raise PathFilterException("%s: %s" % (str(e), path))
if not marked:
raise PathFilterException("Filter did not select any items: %s" % path)
return self
def addIncluded(self, path):
""" Add an inclusion rule """
return self.addFilter(path, INCLUDED)
def addExcluded(self, path):
""" Add an exclusion rule """
return self.addFilter(path, EXCLUDED)
def addFilters(self, paths):
""" Add multiple filters. Filters with a "!" at the beginning are exclusions,
everything else is an inclusion. Order matters, the last rule wins over
previous rules matching the same entry.
"""
# we return the list of exceptions instead of throwing one simply because
# this is a batch operation, and my general rule on batch operations is to
# try as many as you can in order to collect a large number of errors, instead
# of just bailing at the first one.
exceptions = []
for path in paths:
path = path.strip()
try:
if path.startswith('!'):
self.addExcluded(path[1:])
else:
self.addIncluded(path)
except PathFilterException, e:
exceptions.append(str(e))
return exceptions
def getFilteredList(self):
""" Generator returning the list of included paths """
# By default, nothing is selected, so we start our traversal
# with bias=EXCLUDED. We must run into a mark=INCLUDED for
# things to start being counted as selected.
self.root.setFilteredCount(EXCLUDED)
# setFilteredCount actually does the hard work, here we
# simply dump out recursively...
for entry in self.root.getFilteredList():
yield joinPath(entry)
def getUnfilteredList(self):
""" Dump the current engine state """
for entry in self.root.getUnfilteredList():
yield joinPath(entry)
def dumpFilteredList(self, title):
""" Test helper function to dump a filtered list """
empty = True
print "===[ %s ]===" % title
for entry in self.getFilteredList():
print ">%s<" % entry
empty = False
if empty:
print "empty"
def dumpUnfilteredList(self, title):
""" Test helper function to dump the current engine state """
empty = True
print "===[ %s ]===" % title
for entry in self.getUnfilteredList():
print ">%s<" % entry
empty = False
if empty:
print "empty"
if __name__ == '__main__':
f = PathFilter()
f.addEntry('my-dir-a/my-dir-b/1')
f.addEntry('//my-dir-a///my-dir-b/2')
f.addEntry('my-dir-a/my-dir-b/3')
f.addEntry('my-dir-a/my-dir-b/my-dir-x/1')
f.addEntry('my-dir-a/my-dir-b/my-dir-x/2')
f.addEntry('my-dir-a/my-dir-b/my-dir-x/3')
f.addEntry('my-dir-a/my-dir-b/my-dir-x/my-dir-y/1')
f.addEntry('my-dir-a/my-dir-b/my-dir-x/my-dir-y/2')
f.addEntry('my-dir-a/my-dir-b/my-dir-x/my-dir-y/3')
f.addEntry('your-dir-a/your-dir-b/1')
f.addEntry('your-dir-a/your-dir-b/2')
f.addEntry('your-dir-a/your-dir-b/3')
f.addEntry('your-dir-a/your-dir-b/3.text')
f.addEntry('your-dir-a/your-dir-b/3Xtext')
f.addEntry('your-dir-a/your-dir-b/your-dir-x/1')
f.addEntry('your-dir-a/your-dir-b/your-dir-x/2')
f.addEntry('your-dir-a/your-dir-b/your-dir-x/3')
f.addEntry('your-dir-a/your-dir-b/your-dir-x/your-dir-y/1')
f.addEntry('your-dir-a/your-dir-b/your-dir-x/your-dir-y/2')
f.addEntry('your-dir-a/your-dir-b/your-dir-x/your-dir-y/3')
f.addEntry('your-dir-a/x1/x2/x3/your-dir-b/y1/y2/y3/your-dir-x/z1/z2/z3/your-dir-y/999')
f.addEntry('your-dir-a/x3/your-dir-b/y1/y2/y3/your-dir-x/z1/z2/z3/your-dir-y/999')
f.addEntry('your-dir-a/x1/x2/your-dir-b/y1/y2/y3/your-dir-x/z1/z2/your-dir-y/999')
f.addEntry('your-dir-a/x1/your-dir-b/y1/y3/your-dir-x/z1/z2/z3/your-dir-y/999')
f.addEntry('your-dir-a/x1/x2/x3/your-dir-b/y1/y2/your-dir-x/z1/z2/your-dir-y/999')
f.addEntry('your-dir-a/x1/your-dir-b/y1/your-dir-x/z1/your-dir-y/999')
for entry in ('my-dir-a', 'a/.../b', 'a/x*b/c'):
try:
f.addEntry(entry)
except PathFilterException, e:
print "Caught exception %s, good" % str(e)
else:
print "Expected exception."
f.dumpFilteredList('Initial load - should be empty)')
#f.dumpUnfilteredList('Initial load - should be everything)')
f.addIncluded('my-dir-a/my-dir-b/my-dir-x')
f.dumpFilteredList('Including my-dir-a/my-dir-b/my-dir-x')
#f.dumpUnfilteredList('Including my-dir-a/my-dir-b/my-dir-x')
f.addExcluded('my-dir-a/my-dir-b/my-dir-x/my-dir-y')
f.dumpFilteredList('Excluding my-dir-a/my-dir-b/my-dir-x/my-dir-y')
#f.dumpUnfilteredList('Excluding my-dir-a/my-dir-b/my-dir-x/my-dir-y')
f.addIncluded('my-dir-a/my-dir-b/my-dir-x/my-dir-y/1')
f.dumpFilteredList('Including my-dir-a/my-dir-b/my-dir-x/my-dir-y/1')
#f.dumpUnfilteredList('Including my-dir-a/my-dir-b/my-dir-x/my-dir-y/1')
f.reset()
f.dumpFilteredList('After reset')
#f.dumpUnfilteredList('After reset')
f.addIncluded('my-dir-a/my-dir-b/my-dir-x')
#f.dumpUnfilteredList('Including my-dir-a/my-dir-b/my-dir-x/my-dir-y/1')
f.dumpFilteredList('Including my-dir-a/my-dir-b/my-dir-x')
f.addIncluded('my-dir-a/my-dir-b/my-dir-x/*')
f.dumpFilteredList('Including my-dir-a/my-dir-b/my-dir-x (should be the same)')
try:
f.addIncluded('.../1/*')
except PathFilterException, e:
print "Caught exception %s, good" % str(e)
else:
print "Expected exception."
f.dumpFilteredList('Including .../1/* (should be the same)')
f.reset()
f.dumpFilteredList('After reset')
f.addIncluded('.../1')
f.dumpFilteredList('Including .../1')
f.addExcluded('.../my-dir-x/1')
f.dumpFilteredList('Excluding .../my-dir-x/1')
f.addIncluded('your-dir-a/...')
f.dumpFilteredList('Including your-dir-a/...')
f.addExcluded('.../your-dir-*')
f.dumpFilteredList('Excluding .../your-dir-*')
f.reset()
f.dumpFilteredList('After reset')
f.addIncluded('.../.../1')
f.dumpFilteredList('Including .../.../1')
try:
f.addIncluded('.../.../1/*')
except PathFilterException, e:
print "Caught exception %s, good" % str(e)
else:
print "Expected exception."
f.dumpFilteredList('Including .../.../1/*')
f.addExcluded('.../.../my-dir-x/1')
f.dumpFilteredList('Excluding .../.../my-dir-x/1')
f.addIncluded('your-dir-a/.../...')
f.dumpFilteredList('Including your-dir-a/.../...')
f.addExcluded('.../.../your-dir-*')
f.dumpFilteredList('Excluding .../.../your-dir-*')
f.reset()
f.dumpFilteredList('After reset')
f.addIncluded('your-dir-a/.../your-dir-b/.../your-dir-x/.../your-dir-y')
f.dumpFilteredList('Include your-dir-a/.../your-dir-b/.../your-dir-x/.../your-dir-y')
f.addExcluded('.../[1-3]')
f.dumpFilteredList('Exclude .../[1-3]')
f.addExcluded('.../y2')
f.dumpFilteredList('Exclude .../y2')
f.addExcluded('.../y2/.../999')
f.dumpFilteredList('Exclude .../y2/.../999')
f.reset()
f.dumpFilteredList('After reset')
f.addIncluded('your-dir-a/*/your-dir-b/*/your-dir-x/*/your-dir-y')
f.dumpFilteredList('Include your-dir-a/*/your-dir-b/*/your-dir-x/*/your-dir-y')
f.reset()
f.dumpFilteredList('After reset')
f.addIncluded('.../*.text')
f.dumpFilteredList('Include .../*.text (should not match 3Xtext)')
print "try again with batch oriented functions"
f = PathFilter()
errors = f.addEntries(['my-dir-a/my-dir-b/1',
'//my-dir-a///my-dir-b/2',
'my-dir-a/my-dir-b/3',
'my-dir-a/my-dir-b/my-dir-x/1',
'my-dir-a/my-dir-b/my-dir-x/2',
'my-dir-a/my-dir-b/my-dir-x/3',
'my-dir-a/my-dir-b/my-dir-x/my-dir-y/1',
'my-dir-a/my-dir-b/my-dir-x/my-dir-y/2',
'my-dir-a/my-dir-b/my-dir-x/my-dir-y/3',
'your-dir-a/your-dir-b/1',
'your-dir-a/your-dir-b/2',
'your-dir-a/your-dir-b/3',
'your-dir-a/your-dir-b/3.text',
'your-dir-a/your-dir-b/3Xtext',
'your-dir-a/your-dir-b/your-dir-x/1',
'your-dir-a/your-dir-b/your-dir-x/2',
'your-dir-a/your-dir-b/your-dir-x/3',
'your-dir-a/your-dir-b/your-dir-x/your-dir-y/1',
'your-dir-a/your-dir-b/your-dir-x/your-dir-y/2',
'your-dir-a/your-dir-b/your-dir-x/your-dir-y/3',
'your-dir-a/x1/x2/x3/your-dir-b/y1/y2/y3/your-dir-x/z1/z2/z3/your-dir-y/999',
'your-dir-a/x3/your-dir-b/y1/y2/y3/your-dir-x/z1/z2/z3/your-dir-y/999',
'your-dir-a/x1/x2/your-dir-b/y1/y2/y3/your-dir-x/z1/z2/your-dir-y/999',
'your-dir-a/x1/your-dir-b/y1/y3/your-dir-x/z1/z2/z3/your-dir-y/999',
'your-dir-a/x1/x2/x3/your-dir-b/y1/y2/your-dir-x/z1/z2/your-dir-y/999',
'your-dir-a/x1/your-dir-b/y1/your-dir-x/z1/your-dir-y/999',
'my-dir-a',
'a/.../b',
'a/x*b/c'])
print "Exceptions in initial load"
print "\n".join(errors)
f.dumpFilteredList('Initial load - should be empty)')
f.dumpUnfilteredList('Initial load - should be everything)')
f.dumpFilteredList('After reset')
errors = f.addFilters(['.../.../1',
'.../.../1/*',
'!.../.../my-dir-x/1'])
print "Exceptions in adding filters"
print "\n".join(errors)
f.dumpFilteredList('Including .../.../1, Excluding .../.../my-dir-x/1')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment