Last active
May 3, 2017 22:41
-
-
Save cg-soft/f89cc18984e970d62270a7504834c5f3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2.7 | |
# Filter a set of file paths using wildcarded inclusion and exclusion rules | |
# | |
# Usage: See tests below | |
# | |
# Restrictions: | |
# File paths are assumed to be relative paths. Leading slashes are ignored. | |
# File names may not contain wildcards. | |
# Supported wildcards: | |
# "..." matches zero or more complete directory entries. As such, it is | |
# only valid within slashes, as in .../a/b, a/.../b or such. | |
# "*" matches zero or more characters within a directory entry. | |
# "[x]" matches enclosed characters, for example [a-z], [xy], [^x] (not x)... | |
# No distinction is made between files and directories. If you want a rule to select | |
# only a directory, you need to end it with "/*". | |
# Trailing "/" or "/..." in a rule have no effect. | |
# Inclusions and exclusions are merged, in the sense that whichever rule matches | |
# the entry closest to the end of the path wins. For example, if you have three rules: | |
# addInclude('a/b') | |
# addExclude('a/b/c') | |
# addInclude('a/b/c/d') | |
# it means: include everything in a/b, except for things in a/b/c, but do include everything | |
# in a/b/c/d. | |
# If multiple rules select the same path location, the last entered rule wins | |
# The output of getFilteredEntries() is a minimal set of paths listing all included locations. | |
# A directory in that list means that all subdirectories and files are also included. | |
import re | |
# "Bias" are marks which apply to a specific directory, to possibly be overridden by | |
# anoither mark in a subdirectory. | |
UNMARKED = 0 | |
INCLUDED = 1 | |
EXCLUDED = 2 | |
# For rendering and debugging bias marks | |
BIAS = ('(.)', '(+)', '(-)') | |
# For transforming a shell glob into a regexp | |
REGEX_QUOTE = re.compile(r'([\\$+{}()?.])') | |
def splitPath(path, noDuplicate=()): | |
""" Helper function to remove duplications in paths. | |
The noDuplicate set contains entries like '...' to be | |
removed as duplicates (eg '.../...') or at the end of the path. | |
Entries need to be non-blank, to collapse consecutive slashes. | |
""" | |
cleanedPath = [] | |
for entry in path.split('/'): | |
if entry: | |
if entry in noDuplicate: | |
if cleanedPath and cleanedPath[-1] == entry: | |
continue | |
cleanedPath.append(entry) | |
while cleanedPath[-1] in noDuplicate: | |
cleanedPath.pop() | |
return cleanedPath | |
def joinPath(path): | |
""" Helper function to recreate a proper path. | |
Since root's name is blank, strip off the first slash. | |
""" | |
return "/".join(path).lstrip('/') | |
class PathFilterException: | |
def __init__(self, error): | |
self.error = error | |
def __str__(self): | |
return str(self.error) | |
class PathNode: | |
def __init__(self, name=''): | |
""" Helper class to represent a single directory node """ | |
self.name = name | |
self.mark = UNMARKED | |
self.seen = False | |
self.unfilteredKids = 0 | |
self.filteredKids = 0 | |
self.kids = {} | |
def reset(self): | |
self.mark = UNMARKED | |
self.filteredKids = 0 | |
if self.seen: | |
self.seen = False | |
for kid in self.kids: | |
self.kids[kid].reset() | |
def addEntry(self, path): | |
""" Add a permanent directory entry """ | |
tail = path[1:] | |
head = path[0] | |
if '*' in head or '[' in head or ']' in head or head == '...': | |
raise PathFilterException('Unsupported character sequence in entry') | |
if head in self.kids: | |
if tail: | |
self.kids[head].addEntry(tail) | |
else: | |
raise PathFilterException('Duplicate entry') | |
else: | |
self.kids[head] = PathNode(head) | |
if tail: | |
self.kids[head].addEntry(tail) | |
self.unfilteredKids += 1 | |
return self | |
def addFilter(self, path, bias=INCLUDED): | |
""" Add a filter condition. This sets the bias marks """ | |
# List of marked items, used both to check whether the rule | |
# selected something and to keep track so that reset() is cheap. | |
self.seen = True | |
# if we are leaf, just set the requested bias | |
if not path: | |
self.mark = bias | |
return 1 | |
# if we are not the leaf, continue processing | |
marked = 0 | |
tail = path[1:] | |
head = path[0] | |
# "..." matches zero or more directories, so.... | |
if head == '...': | |
# tail must be non-empty, because splitPath() enforces no trailing ... | |
head = tail.pop(0) | |
# process path as passed in on all kids, thereby skipping the current dir | |
for kid in self.kids: | |
marked += self.kids[kid].addFilter(path, bias) | |
# Deal with wildcards, if present | |
if head == '*': | |
# simplified case... | |
for kid in self.kids: | |
marked += self.kids[kid].addFilter(tail, bias) | |
elif '*' in head or ('[' in head and ']' in head): | |
try: | |
headRe = re.compile(REGEX_QUOTE.sub(r'\\\1', head).replace('*','.*')) | |
except: | |
raise PathFilterException('Unable to process wildcards') | |
#print "RE for", head, "is", headRe.pattern | |
for kid in self.kids: | |
if headRe.match(kid): | |
marked += self.kids[kid].addFilter(tail, bias) | |
# Otherwise check if there is an exact match | |
elif head in self.kids: | |
marked += self.kids[head].addFilter(tail, bias) | |
return marked | |
def getUnfilteredList(self): | |
""" Generator returning a dump of the engine state """ | |
if self.seen: | |
sep = '' | |
else: | |
sep = '!' | |
if self.kids: | |
for kid in sorted(self.kids.keys()): | |
for entry in self.kids[kid].getUnfilteredList(): | |
yield [ self.name + BIAS[self.mark] + '[%d/%d]%s' % (self.unfilteredKids, self.filteredKids, sep) ] + entry | |
else: | |
yield [ self.name + BIAS[self.mark] + '[%d/%d]%s' % (self.unfilteredKids, self.filteredKids, sep) ] | |
def setFilteredCount(self, bias): | |
""" Count number of kids selected by current rule set. | |
This can be compared with the unfilteredKids to decide whether | |
to return just this node, or continue recursing into sub-nodes. | |
""" | |
self.filteredKids = 0 | |
if self.seen: | |
if self.mark != UNMARKED: | |
bias = self.mark | |
if self.kids: | |
for kid in self.kids: | |
self.filteredKids += self.kids[kid].setFilteredCount(bias) | |
elif bias == INCLUDED: | |
# this is a leaf node, so if selected it will stand out | |
self.filteredKids = 1 | |
elif bias == INCLUDED: | |
# If no rule has touched this node, we either return everything or nothing | |
if self.kids: | |
self.filteredKids = self.unfilteredKids | |
else: | |
self.filteredKids = 1 | |
return self.filteredKids | |
def getFilteredList(self): | |
""" Traverse the tree, dumping the node if either: | |
- there are kids, but all kids are included | |
- there are no kids, but the leaf node is selected | |
""" | |
if (self.unfilteredKids > 0 and self.filteredKids == self.unfilteredKids) or\ | |
(self.filteredKids > 0 and self.unfilteredKids == 0): | |
yield [ self.name ] | |
else: | |
for kid in sorted(self.kids.keys()): | |
for entry in self.kids[kid].getFilteredList(): | |
yield [ self.name ] + entry | |
class PathFilter: | |
def __init__(self): | |
""" Filter a set of file paths using wildcarded inclusion and exclusion rules | |
Usage: See tests below | |
Restrictions: | |
File paths are assumed to be relative paths. Leading slashes are ignored. | |
File names may not contain wildcards. | |
Supported wildcards: | |
"..." matches zero or more complete directory entries. As such, it is | |
only valid within slashes, as in .../a/b, a/.../b or such. | |
"*" matches zero or more characters within a directory entry. | |
"[x]" matches enclosed characters, for example [a-z], [xy], [^x] (not x)... | |
No distinction is made between files and directories. If you want a rule to select | |
only a directory, you need to end it with "/*". | |
Trailing "/" or "/..." in a rule have no effect. | |
Inclusions and exclusions are merged, in the sense that whichever rule matches | |
the entry closest to the end of the path wins. For example, if you have three rules: | |
addInclude('a/b') | |
addExclude('a/b/c') | |
addInclude('a/b/c/d') | |
it means: include everything in a/b, except for things in a/b/c, but do include everything | |
in a/b/c/d. | |
If multiple rules select the same path location, the last entered rule wins | |
The output of getFilteredEntries() is a minimal set of paths listing all included locations. | |
A directory in that list means that all subdirectories and files are also included. | |
""" | |
self.root = PathNode() | |
self.marked = [] | |
def reset(self): | |
""" Delete all rules and all traces of processing. """ | |
self.root.reset() | |
return self | |
def addEntry(self, path): | |
""" Add a selectable path entry. This must be a unique and complete file path. """ | |
try: | |
self.root.addEntry(splitPath(path)) | |
except PathFilterException, e: | |
# re-raise exception, adding the user supplied input for easy reporting | |
raise PathFilterException("%s: %s" % (str(e), path)) | |
return self | |
def addEntries(self, paths): | |
""" Add a list of paths, returning a collected list of exceptions. """ | |
exceptions = [] | |
for path in paths: | |
try: | |
self.addEntry(path) | |
except PathFilterException, e: | |
exceptions.append(str(e)) | |
return exceptions | |
def addFilter(self, path, bias=INCLUDED): | |
""" Add an inclusion or an exclusion rule. """ | |
try: | |
marked = self.root.addFilter(splitPath(path, ('...',)), bias) | |
except PathFilterException, e: | |
# re-raise exception, adding the user supplied input for easy reporting | |
raise PathFilterException("%s: %s" % (str(e), path)) | |
if not marked: | |
raise PathFilterException("Filter did not select any items: %s" % path) | |
return self | |
def addIncluded(self, path): | |
""" Add an inclusion rule """ | |
return self.addFilter(path, INCLUDED) | |
def addExcluded(self, path): | |
""" Add an exclusion rule """ | |
return self.addFilter(path, EXCLUDED) | |
def addFilters(self, paths): | |
""" Add multiple filters. Filters with a "!" at the beginning are exclusions, | |
everything else is an inclusion. Order matters, the last rule wins over | |
previous rules matching the same entry. | |
""" | |
# we return the list of exceptions instead of throwing one simply because | |
# this is a batch operation, and my general rule on batch operations is to | |
# try as many as you can in order to collect a large number of errors, instead | |
# of just bailing at the first one. | |
exceptions = [] | |
for path in paths: | |
path = path.strip() | |
try: | |
if path.startswith('!'): | |
self.addExcluded(path[1:]) | |
else: | |
self.addIncluded(path) | |
except PathFilterException, e: | |
exceptions.append(str(e)) | |
return exceptions | |
def getFilteredList(self): | |
""" Generator returning the list of included paths """ | |
# By default, nothing is selected, so we start our traversal | |
# with bias=EXCLUDED. We must run into a mark=INCLUDED for | |
# things to start being counted as selected. | |
self.root.setFilteredCount(EXCLUDED) | |
# setFilteredCount actually does the hard work, here we | |
# simply dump out recursively... | |
for entry in self.root.getFilteredList(): | |
yield joinPath(entry) | |
def getUnfilteredList(self): | |
""" Dump the current engine state """ | |
for entry in self.root.getUnfilteredList(): | |
yield joinPath(entry) | |
def dumpFilteredList(self, title): | |
""" Test helper function to dump a filtered list """ | |
empty = True | |
print "===[ %s ]===" % title | |
for entry in self.getFilteredList(): | |
print ">%s<" % entry | |
empty = False | |
if empty: | |
print "empty" | |
def dumpUnfilteredList(self, title): | |
""" Test helper function to dump the current engine state """ | |
empty = True | |
print "===[ %s ]===" % title | |
for entry in self.getUnfilteredList(): | |
print ">%s<" % entry | |
empty = False | |
if empty: | |
print "empty" | |
if __name__ == '__main__': | |
f = PathFilter() | |
f.addEntry('my-dir-a/my-dir-b/1') | |
f.addEntry('//my-dir-a///my-dir-b/2') | |
f.addEntry('my-dir-a/my-dir-b/3') | |
f.addEntry('my-dir-a/my-dir-b/my-dir-x/1') | |
f.addEntry('my-dir-a/my-dir-b/my-dir-x/2') | |
f.addEntry('my-dir-a/my-dir-b/my-dir-x/3') | |
f.addEntry('my-dir-a/my-dir-b/my-dir-x/my-dir-y/1') | |
f.addEntry('my-dir-a/my-dir-b/my-dir-x/my-dir-y/2') | |
f.addEntry('my-dir-a/my-dir-b/my-dir-x/my-dir-y/3') | |
f.addEntry('your-dir-a/your-dir-b/1') | |
f.addEntry('your-dir-a/your-dir-b/2') | |
f.addEntry('your-dir-a/your-dir-b/3') | |
f.addEntry('your-dir-a/your-dir-b/3.text') | |
f.addEntry('your-dir-a/your-dir-b/3Xtext') | |
f.addEntry('your-dir-a/your-dir-b/your-dir-x/1') | |
f.addEntry('your-dir-a/your-dir-b/your-dir-x/2') | |
f.addEntry('your-dir-a/your-dir-b/your-dir-x/3') | |
f.addEntry('your-dir-a/your-dir-b/your-dir-x/your-dir-y/1') | |
f.addEntry('your-dir-a/your-dir-b/your-dir-x/your-dir-y/2') | |
f.addEntry('your-dir-a/your-dir-b/your-dir-x/your-dir-y/3') | |
f.addEntry('your-dir-a/x1/x2/x3/your-dir-b/y1/y2/y3/your-dir-x/z1/z2/z3/your-dir-y/999') | |
f.addEntry('your-dir-a/x3/your-dir-b/y1/y2/y3/your-dir-x/z1/z2/z3/your-dir-y/999') | |
f.addEntry('your-dir-a/x1/x2/your-dir-b/y1/y2/y3/your-dir-x/z1/z2/your-dir-y/999') | |
f.addEntry('your-dir-a/x1/your-dir-b/y1/y3/your-dir-x/z1/z2/z3/your-dir-y/999') | |
f.addEntry('your-dir-a/x1/x2/x3/your-dir-b/y1/y2/your-dir-x/z1/z2/your-dir-y/999') | |
f.addEntry('your-dir-a/x1/your-dir-b/y1/your-dir-x/z1/your-dir-y/999') | |
for entry in ('my-dir-a', 'a/.../b', 'a/x*b/c'): | |
try: | |
f.addEntry(entry) | |
except PathFilterException, e: | |
print "Caught exception %s, good" % str(e) | |
else: | |
print "Expected exception." | |
f.dumpFilteredList('Initial load - should be empty)') | |
#f.dumpUnfilteredList('Initial load - should be everything)') | |
f.addIncluded('my-dir-a/my-dir-b/my-dir-x') | |
f.dumpFilteredList('Including my-dir-a/my-dir-b/my-dir-x') | |
#f.dumpUnfilteredList('Including my-dir-a/my-dir-b/my-dir-x') | |
f.addExcluded('my-dir-a/my-dir-b/my-dir-x/my-dir-y') | |
f.dumpFilteredList('Excluding my-dir-a/my-dir-b/my-dir-x/my-dir-y') | |
#f.dumpUnfilteredList('Excluding my-dir-a/my-dir-b/my-dir-x/my-dir-y') | |
f.addIncluded('my-dir-a/my-dir-b/my-dir-x/my-dir-y/1') | |
f.dumpFilteredList('Including my-dir-a/my-dir-b/my-dir-x/my-dir-y/1') | |
#f.dumpUnfilteredList('Including my-dir-a/my-dir-b/my-dir-x/my-dir-y/1') | |
f.reset() | |
f.dumpFilteredList('After reset') | |
#f.dumpUnfilteredList('After reset') | |
f.addIncluded('my-dir-a/my-dir-b/my-dir-x') | |
#f.dumpUnfilteredList('Including my-dir-a/my-dir-b/my-dir-x/my-dir-y/1') | |
f.dumpFilteredList('Including my-dir-a/my-dir-b/my-dir-x') | |
f.addIncluded('my-dir-a/my-dir-b/my-dir-x/*') | |
f.dumpFilteredList('Including my-dir-a/my-dir-b/my-dir-x (should be the same)') | |
try: | |
f.addIncluded('.../1/*') | |
except PathFilterException, e: | |
print "Caught exception %s, good" % str(e) | |
else: | |
print "Expected exception." | |
f.dumpFilteredList('Including .../1/* (should be the same)') | |
f.reset() | |
f.dumpFilteredList('After reset') | |
f.addIncluded('.../1') | |
f.dumpFilteredList('Including .../1') | |
f.addExcluded('.../my-dir-x/1') | |
f.dumpFilteredList('Excluding .../my-dir-x/1') | |
f.addIncluded('your-dir-a/...') | |
f.dumpFilteredList('Including your-dir-a/...') | |
f.addExcluded('.../your-dir-*') | |
f.dumpFilteredList('Excluding .../your-dir-*') | |
f.reset() | |
f.dumpFilteredList('After reset') | |
f.addIncluded('.../.../1') | |
f.dumpFilteredList('Including .../.../1') | |
try: | |
f.addIncluded('.../.../1/*') | |
except PathFilterException, e: | |
print "Caught exception %s, good" % str(e) | |
else: | |
print "Expected exception." | |
f.dumpFilteredList('Including .../.../1/*') | |
f.addExcluded('.../.../my-dir-x/1') | |
f.dumpFilteredList('Excluding .../.../my-dir-x/1') | |
f.addIncluded('your-dir-a/.../...') | |
f.dumpFilteredList('Including your-dir-a/.../...') | |
f.addExcluded('.../.../your-dir-*') | |
f.dumpFilteredList('Excluding .../.../your-dir-*') | |
f.reset() | |
f.dumpFilteredList('After reset') | |
f.addIncluded('your-dir-a/.../your-dir-b/.../your-dir-x/.../your-dir-y') | |
f.dumpFilteredList('Include your-dir-a/.../your-dir-b/.../your-dir-x/.../your-dir-y') | |
f.addExcluded('.../[1-3]') | |
f.dumpFilteredList('Exclude .../[1-3]') | |
f.addExcluded('.../y2') | |
f.dumpFilteredList('Exclude .../y2') | |
f.addExcluded('.../y2/.../999') | |
f.dumpFilteredList('Exclude .../y2/.../999') | |
f.reset() | |
f.dumpFilteredList('After reset') | |
f.addIncluded('your-dir-a/*/your-dir-b/*/your-dir-x/*/your-dir-y') | |
f.dumpFilteredList('Include your-dir-a/*/your-dir-b/*/your-dir-x/*/your-dir-y') | |
f.reset() | |
f.dumpFilteredList('After reset') | |
f.addIncluded('.../*.text') | |
f.dumpFilteredList('Include .../*.text (should not match 3Xtext)') | |
print "try again with batch oriented functions" | |
f = PathFilter() | |
errors = f.addEntries(['my-dir-a/my-dir-b/1', | |
'//my-dir-a///my-dir-b/2', | |
'my-dir-a/my-dir-b/3', | |
'my-dir-a/my-dir-b/my-dir-x/1', | |
'my-dir-a/my-dir-b/my-dir-x/2', | |
'my-dir-a/my-dir-b/my-dir-x/3', | |
'my-dir-a/my-dir-b/my-dir-x/my-dir-y/1', | |
'my-dir-a/my-dir-b/my-dir-x/my-dir-y/2', | |
'my-dir-a/my-dir-b/my-dir-x/my-dir-y/3', | |
'your-dir-a/your-dir-b/1', | |
'your-dir-a/your-dir-b/2', | |
'your-dir-a/your-dir-b/3', | |
'your-dir-a/your-dir-b/3.text', | |
'your-dir-a/your-dir-b/3Xtext', | |
'your-dir-a/your-dir-b/your-dir-x/1', | |
'your-dir-a/your-dir-b/your-dir-x/2', | |
'your-dir-a/your-dir-b/your-dir-x/3', | |
'your-dir-a/your-dir-b/your-dir-x/your-dir-y/1', | |
'your-dir-a/your-dir-b/your-dir-x/your-dir-y/2', | |
'your-dir-a/your-dir-b/your-dir-x/your-dir-y/3', | |
'your-dir-a/x1/x2/x3/your-dir-b/y1/y2/y3/your-dir-x/z1/z2/z3/your-dir-y/999', | |
'your-dir-a/x3/your-dir-b/y1/y2/y3/your-dir-x/z1/z2/z3/your-dir-y/999', | |
'your-dir-a/x1/x2/your-dir-b/y1/y2/y3/your-dir-x/z1/z2/your-dir-y/999', | |
'your-dir-a/x1/your-dir-b/y1/y3/your-dir-x/z1/z2/z3/your-dir-y/999', | |
'your-dir-a/x1/x2/x3/your-dir-b/y1/y2/your-dir-x/z1/z2/your-dir-y/999', | |
'your-dir-a/x1/your-dir-b/y1/your-dir-x/z1/your-dir-y/999', | |
'my-dir-a', | |
'a/.../b', | |
'a/x*b/c']) | |
print "Exceptions in initial load" | |
print "\n".join(errors) | |
f.dumpFilteredList('Initial load - should be empty)') | |
f.dumpUnfilteredList('Initial load - should be everything)') | |
f.dumpFilteredList('After reset') | |
errors = f.addFilters(['.../.../1', | |
'.../.../1/*', | |
'!.../.../my-dir-x/1']) | |
print "Exceptions in adding filters" | |
print "\n".join(errors) | |
f.dumpFilteredList('Including .../.../1, Excluding .../.../my-dir-x/1') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment