Created
May 21, 2018 21:40
-
-
Save zsimic/cf09e659a4c95468ee423bb940af48ad to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Support for understanding files/folders to ignore (from .gitignore or .svnignore)

These classes should be reusable and can be moved to a library.

Example usage:

    ignored_files = IgnoredFiles(mypath)
    for root, dirs, files in os.walk(mypath):
        ignored_files.remove_ignored_folders(root, dirs)
        for filename in files:
            fpath = os.path.join(root, filename)
            if ignored_files.match(fpath, is_dir=False):
                continue
            ...
""" | |
import fnmatch | |
import os | |
import re | |
class IgnoredFiles(object):
    """
    Understand ignored files/folders (from .gitignore and/or .svnignore)
    Allows to avoid walking through build and config folders

    Attributes:
        root: Absolute path of the folder the ignore rules apply to
        basename: Basename of the "ignore file" that was parsed
        invalid: Diagnostics for everything that could not be honored: plain strings
                 (missing file, negation patterns, crashes) and rejected pattern objects
    """

    def __init__(self, path, basename='.gitignore', use_default_ignores=True):
        """
        :param str path: Path to folder for which we want to know what to ignore
        :param str basename: Basename of the "ignore file"
        :param bool use_default_ignores: Auto-add all usual ignores? (ie: .git/ .gradle/ etc)
        """
        self.root = os.path.abspath(path)
        self.invalid = []
        # A list (not a set): patterns are tried in the order they were declared, so the
        # pattern reported by match() / show_ignores() is the same from one run to the next
        self._regexes = []
        self.basename = basename
        self.parse_gitignore(os.path.join(self.root, basename))
        if use_default_ignores:
            for pattern in ('.git/', '.gradle/', '.idea/', 'acl/', 'build/', 'dist/'):
                self.add(pattern)

    def __repr__(self):
        return '%s ignores, %s invalid' % (len(self._regexes), len(self.invalid))

    def __len__(self):
        return len(self._regexes)

    def match(self, path, is_dir=None):
        """
        :param str path: Full path to file
        :param bool|None is_dir: If applicable, caller can indicate whether 'path' is a directory or not (to save a file stat call)
        :return IgnorePattern|None: First declared pattern that leads to 'path' being ignored, if any
        """
        for regex in self._regexes:
            if regex.match(path, is_dir=is_dir):
                return regex
        return None

    def show_ignores(self):
        """
        Useful for debugging, show which files would be ignored in self.root, and why (due to which pattern)

        :return str: One line per ignored file/folder, of the form "<pattern description>: <relative path>"
        """
        result = []
        for root, dirs, files in os.walk(self.root):
            # Iterate over a copy: ignored folders are pruned from 'dirs' so os.walk() doesn't descend into them
            for basename in dirs[:]:
                fpath = os.path.join(root, basename)
                m = self.match(fpath, is_dir=True)
                if m:
                    dirs.remove(basename)
                    result.append("%-30s: %s" % (m.description, os.path.relpath(fpath, self.root)))
            for basename in files:
                fpath = os.path.join(root, basename)
                m = self.match(fpath, is_dir=False)
                if m:
                    result.append("%-30s: %s" % (m.description, os.path.relpath(fpath, self.root)))
        return '\n'.join(result)

    def parse_gitignore(self, path):
        """
        Add ignores as defined in .gitignore file with 'path'

        Blank lines and '#' comment lines are skipped. Negation ('!') patterns are not supported,
        they are recorded in self.invalid. Any failure while reading is recorded in self.invalid
        as well (this never raises).

        :param str path: Path to .gitignore file
        """
        try:
            if not os.path.exists(path):
                self.invalid.append("No file %s" % path)
                return
            with open(path) as fh:
                for line_number, line in enumerate(fh, start=1):
                    line = line.strip()
                    if not line or line[0] == '#':
                        continue
                    if line[0] == '!':
                        # Negation patterns not supported... we possibly won't find all files in such esoteric setups
                        self.invalid.append("Negation pattern line %s not supported" % line_number)
                        continue
                    self.add(line, line_number)
        except Exception as e:
            # Deliberate best effort: an unreadable ignore file must not prevent walking the folder
            self.invalid.append('Crashed: %s' % e)

    def remove_ignored_folders(self, root, dirs):
        """
        Remove all names from 'dirs' that should be ignored, handy for use with os.walk
        :param str root: Parent of 'dirs'
        :param list dirs: List of dirs to remove ignored folders (as defined in this object) from, modified in place
        """
        for basename in dirs[:]:
            if self.match(os.path.join(root, basename), is_dir=True):
                dirs.remove(basename)

    def add(self, pattern, line_number=0):
        """
        :param str pattern: Pattern to ignore
        :param int line_number: Line number in .gitignore file
        """
        pat = IgnorePattern(self.root, pattern, self.basename, line_number)
        if pat.invalid:
            self.invalid.append(pat)
        else:
            self._regexes.append(pat)
class IgnorePattern(object):
    """
    Represents a .gitignore pattern, compatible with https://git-scm.com/docs/gitignore

    Only a subset of the syntax is supported; anything else is flagged via 'self.invalid'
    (a short reason string) and such a pattern must not be used for matching.
    """

    def __init__(self, root, pattern, basename, line_number):
        """
        :param str root: Folder containing ignore file
        :param str pattern: Pattern from ignore file
        :param str basename: Basename of ignore file
        :param int line_number: Line number in ignore file
        """
        self.root = root
        self.pattern = pattern
        self.description = '%s:%2s:%s' % (basename, line_number, pattern)
        self.invalid = None
        self.applies_to_directories = False     # When True, this pattern applies to directories only (not files or symlinks)
        self.match_basename = False             # When True, match against filename (otherwise: relative path)
        self.exact_match = None                 # Exact string to match (no regex needed)
        self.regex = None                       # Regex to use

        if pattern.endswith('/'):
            # Anything ending with '/' simply means pattern applies to directories only
            self.applies_to_directories = True
            pattern = pattern[:-1]
            if pattern == '**':
                # "**/": every directory
                self.match_basename = True
                self.regex = re.compile('.*')
                return

        if pattern.startswith('**/'):
            pattern = pattern[3:]
            if self._has_glob(pattern):
                self.invalid = "Too complex"
                return
            if not pattern:
                self.match_basename = True
                self.regex = re.compile('.*')
            elif '/' in pattern:
                # **/foo/bar: 'foo/bar' at any depth, including directly under root
                self.match_basename = False
                self.regex = re.compile(r'(?:.*/)?%s\Z' % re.escape(pattern))
            else:
                # **/foo is the same as ignoring basename foo
                self.match_basename = True
                self.exact_match = pattern
            return

        if pattern.endswith('/**'):
            pattern = pattern[:-3]
            if self._has_glob(pattern):
                self.invalid = "Too complex"
                return
            # Approximation: ignore the folder itself (its content is then never walked)
            self.applies_to_directories = True
            if not pattern:
                self.match_basename = True
                self.regex = re.compile('.*')
                return

        if '/**/' in pattern:
            first, _, second = pattern.partition('/**/')
            if self._has_glob(first + second):
                self.invalid = "Too complex"
                return
            if first.startswith('/'):
                # Matching is done against the path relative to root, which has no leading '/'
                first = first[1:]
            prefix = re.escape(first) + '/' if first else ''
            # Provide regex representing "foo/**/bar": zero or more folders in between, whole path must match
            self.regex = re.compile(r'%s(?:.*/)?%s\Z' % (prefix, re.escape(second)))
            return

        if '**' in pattern:
            self.invalid = "Not supported"
            return

        if pattern.startswith('/'):
            # Anchored to root: match against relative path
            self.match_basename = False
            pattern = pattern[1:]
        elif '/' not in pattern:
            self.match_basename = True

        if not pattern:
            # Nothing left to match against (ie: pattern was just '/')
            self.invalid = "Empty pattern"
            return

        if self._has_glob(pattern):
            # NOTE: fnmatch does not treat '/' specially, so '*' may span several folders here
            self.regex = re.compile(fnmatch.translate(pattern))
            return

        self.exact_match = pattern

    def __repr__(self):
        return self.pattern

    def _has_glob(self, pattern):
        """
        :param str pattern: Pattern to examine
        :return bool: True if pattern has a shell glob (that can be turned into a regex by fnmatch.translate())
        """
        return '*' in pattern or '?' in pattern or '[' in pattern

    def _relative_path(self, full_path):
        """
        :param str full_path: Full path to file or folder (expected to be located under self.root)
        :return str: Path relative to self.root, using '/' as separator (as ignore patterns do)
        """
        root = self.root
        if root and not root.endswith(os.sep):
            # Skip the separator between root and the rest of the path
            offset = len(root) + 1
        else:
            # Root already ends with a separator (ie: '/'), or is empty
            offset = len(root)
        relative = full_path[offset:]
        if os.sep != '/':
            relative = relative.replace(os.sep, '/')
        return relative

    def match(self, full_path, is_dir=None):
        """
        :param str full_path: Full path to file or folder
        :param bool|None is_dir: If applicable, caller can indicate whether 'full_path' is a directory or not (to save a file stat call)
        :return bool: True if 'full_path' is an ignore-match by this pattern, False otherwise
        """
        assert not self.invalid
        if self.applies_to_directories:
            if is_dir is None:
                is_dir = os.path.isdir(full_path)
            if not is_dir:
                return False

        if self.match_basename:
            name = os.path.basename(full_path)
        else:
            name = self._relative_path(full_path)

        if self.exact_match:
            return name == self.exact_match

        assert self.regex
        return self.regex.match(name) is not None
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment