Skip to content

Instantly share code, notes, and snippets.

@zsimic
Created May 21, 2018 21:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zsimic/cf09e659a4c95468ee423bb940af48ad to your computer and use it in GitHub Desktop.
Save zsimic/cf09e659a4c95468ee423bb940af48ad to your computer and use it in GitHub Desktop.
"""
Support for understanding files/folders to ignore (from .gitignore or .svnignore)
These classes should be reusable, can be moved to a library
Example usage:
    ignored_files = IgnoredFiles(mypath)
    for root, dirs, files in os.walk(mypath):
        ignored_files.remove_ignored_folders(root, dirs)
        for filename in files:
            fpath = os.path.join(root, filename)
            if ignored_files.match(fpath, is_dir=False):
                continue
            ...
"""
import fnmatch
import os
import re
class IgnoredFiles(object):
    """
    Understand ignored files/folders (from .gitignore and/or .svnignore)
    Allows to avoid walking through build and config folders

    Patterns are kept in the order they were added (ignore file first, then the default ignores),
    so the pattern reported by match() and show_ignores() is the first one added that matches.
    """

    # Folders ignored by default when 'use_default_ignores' is set
    DEFAULT_IGNORES = ('.git/', '.gradle/', '.idea/', 'acl/', 'build/', 'dist/')

    def __init__(self, path, basename='.gitignore', use_default_ignores=True):
        """
        :param str path: Path to folder we want to know ignore
        :param str basename: Basename of the "ignore file"
        :param bool use_default_ignores: Auto-add all usual ignores? (ie: .git/ .gradle/ etc)
        """
        self.root = os.path.abspath(path)
        # Everything that could not be used: messages (str) or rejected pattern objects
        self.invalid = []
        # Ordered list (not a set): which pattern gets reported must not depend on object hashes
        self._regexes = []
        self.basename = basename
        self.parse_gitignore(os.path.join(self.root, basename))
        if use_default_ignores:
            for pattern in self.DEFAULT_IGNORES:
                self.add(pattern)

    def __repr__(self):
        return '%s ignores, %s invalid' % (len(self._regexes), len(self.invalid))

    def __len__(self):
        return len(self._regexes)

    def match(self, path, is_dir=None):
        """
        :param str path: Full path to file
        :param bool|None is_dir: If applicable, caller can indicate whether 'path' is a directory or not (to save a file stat call)
        :return IgnorePattern|None: First added pattern that leads to 'path' being ignored, if any
        """
        for regex in self._regexes:
            if regex.match(path, is_dir=is_dir):
                return regex
        return None

    def show_ignores(self):
        """
        Useful for debugging, show which files would be ignored in self.root, and why (due to which pattern)

        :return str: One line per ignored file/folder, formatted as "<pattern description>: <relative path>"
        """
        result = []
        offset = len(self.root) + 1
        for root, dirs, files in os.walk(self.root):
            # Iterate over a copy: ignored folders are removed from 'dirs' so os.walk does not descend into them
            for basename in dirs[:]:
                fpath = os.path.join(root, basename)
                m = self.match(fpath, is_dir=True)
                if m:
                    dirs.remove(basename)
                    result.append("%-30s: %s" % (m.description, fpath[offset:]))
            for basename in files:
                fpath = os.path.join(root, basename)
                m = self.match(fpath, is_dir=False)
                if m:
                    result.append("%-30s: %s" % (m.description, fpath[offset:]))
        return '\n'.join(result)

    def parse_gitignore(self, path):
        """
        Add ignores as defined in .gitignore file with 'path'
        Problems are never raised, they are recorded in self.invalid instead

        :param str path: Path to .gitignore file
        """
        if not os.path.exists(path):
            self.invalid.append("No folder %s" % path)
            return
        # Only the file read is guarded here, so a read failure is reported as such
        try:
            with open(path) as fh:
                lines = fh.readlines()
        except (IOError, OSError, UnicodeError) as e:
            self.invalid.append('Crashed: %s' % e)
            return
        for line_number, line in enumerate(lines, 1):
            line = line.strip()
            if not line or line[0] == '#':
                continue
            if line[0] == '!':
                # Negation patterns not supported... if .gitignore is uber crazy, then too bad,
                # we possibly won't find all .py files in such esoteric setups
                self.invalid.append("Negation pattern line %s not supported" % line_number)
                continue
            try:
                self.add(line, line_number)
            except re.error as e:
                # One pattern that can't be compiled should not discard the remaining lines
                self.invalid.append("Invalid pattern line %s: %s" % (line_number, e))

    def remove_ignored_folders(self, root, dirs):
        """
        Remove all names from 'dirs' that should be ignored (in place), handy for use with os.walk

        :param str root: Parent of 'dirs'
        :param list dirs: List of dirs to remove ignored folders (as defined in this object) from
        """
        # Iterate over a copy since 'dirs' is modified while looping
        for basename in dirs[:]:
            if self.match(os.path.join(root, basename), is_dir=True):
                dirs.remove(basename)

    def add(self, pattern, line_number=0):
        """
        Register 'pattern'; patterns that can't be handled end up in self.invalid

        :param str pattern: Pattern to ignore
        :param int line_number: Line number in .gitignore file
        """
        pat = IgnorePattern(self.root, pattern, self.basename, line_number)
        if pat.invalid:
            self.invalid.append(pat)
        else:
            self._regexes.append(pat)
class IgnorePattern(object):
    """
    Represents a .gitignore pattern, compatible with https://git-scm.com/docs/gitignore

    Supported: plain names, trailing '/' (directories only), leading '/' or inner '/' (anchored to root),
    shell globs ('*', '?', '[...]' never cross a '/'), leading '**/', trailing '/**' and one inner '/**/'.
    Anything else is flagged via the 'invalid' attribute (reason why pattern can't be used).
    """

    def __init__(self, root, pattern, basename, line_number):
        """
        :param str root: Folder containing ignore file
        :param str pattern: Pattern from ignore file
        :param str basename: Basename of ignore file
        :param int line_number: Line number in ignore file
        """
        self.root = root
        self.pattern = pattern
        self.description = '%s:%2s:%s' % (basename, line_number, pattern)
        self.invalid = None
        self.applies_to_directories = False  # When True, this pattern applies to directories only (not files or symlinks)
        self.match_basename = False  # When True, match against filename (otherwise: relative path)
        self.exact_match = None  # Exact string to match (no regex needed)
        self.regex = None  # Regex to use (against basename, or whole relative path)
        self._segments = None  # One regex per path component, for anchored patterns containing globs
        self._parse(pattern)

    def __repr__(self):
        return self.pattern

    def _parse(self, pattern):
        """
        Determine how 'pattern' should be matched, sets self.invalid when it can't be handled

        :param str pattern: Pattern from ignore file
        """
        if pattern.endswith('/'):
            # Anything ending with '/' simply means pattern applies to directories only
            self.applies_to_directories = True
            pattern = pattern[:-1]

        if pattern == '**':
            # '**' matches everything, '**/' matches every directory
            self._match_everything()
            return

        if pattern.startswith('**/'):
            pattern = pattern[3:]
            if self._has_glob(pattern):
                self.invalid = "Too complex"
            elif not pattern:
                self._match_everything()
            elif '/' in pattern:
                # **/foo/bar: 'foo/bar' at any depth, including directly under root
                self.regex = re.compile(r'(?:.*/)?%s\Z' % re.escape(pattern), re.DOTALL)
            else:
                # **/foo is the same as ignoring basename foo
                self.match_basename = True
                self.exact_match = pattern
            return

        anchored = False
        if pattern.endswith('/**'):
            pattern = pattern[:-3]
            if self._has_glob(pattern):
                self.invalid = "Too complex"
                return
            # 'foo/**' is handled by ignoring folder 'foo' itself (callers then don't descend into it)
            self.applies_to_directories = True
            if not pattern:
                self._match_everything()
                return
            # Pattern had a '/' in it, so it is relative to the root folder
            anchored = True

        if '/**/' in pattern:
            first, _, second = pattern.partition('/**/')
            if self._has_glob(first + second):
                self.invalid = "Too complex"
                return
            # Relative paths never start with '/', and "foo/**/bar" is relative to root anyway
            first = first.lstrip('/')
            prefix = (re.escape(first) + '/') if first else ''
            # "foo/**/bar" matches foo/bar, foo/x/bar, foo/x/y/bar (but not foobar or foo/barbaz)
            self.regex = re.compile(r'%s(?:.*/)?%s\Z' % (prefix, re.escape(second)), re.DOTALL)
            return

        if '**' in pattern:
            self.invalid = "Not supported"
            return

        if pattern.startswith('/'):
            anchored = True
            pattern = pattern[1:]
        elif '/' in pattern:
            anchored = True

        if not pattern:
            # A lone '/' for example: nothing left to match against
            self.invalid = "Empty pattern"
            return

        self.match_basename = not anchored
        if not self._has_glob(pattern):
            self.exact_match = pattern
        elif self.match_basename:
            self.regex = self._compile_glob(pattern)
        else:
            # Glob each path component separately, so that '*' and '?' never match a '/'
            self._segments = [self._compile_glob(segment) for segment in pattern.split('/')]

    def _match_everything(self):
        """Configure this pattern to match any name"""
        self.match_basename = True
        self.regex = re.compile('.*')

    def _compile_glob(self, glob):
        """
        :param str glob: Shell glob (without any '/')
        :return: Compiled regex equivalent to 'glob', None (and self.invalid set) if it can't be compiled
        """
        try:
            return re.compile(fnmatch.translate(glob))
        except re.error:
            self.invalid = "Invalid glob"
            return None

    def _has_glob(self, pattern):
        """
        :return bool: True if pattern have a shell glob (that can be turned into a regex by fnmatch.translate())
        """
        return '*' in pattern or '?' in pattern or '[' in pattern

    def _relative_path(self, full_path):
        """
        :param str full_path: Full path to file or folder
        :return str|None: Path relative to self.root, using '/' as separator (None if it can't be determined)
        """
        prefix = self.root + os.sep
        if full_path.startswith(prefix):
            # Usual case: a simple slice is enough
            relative = full_path[len(prefix):]
        else:
            try:
                relative = os.path.relpath(full_path, self.root)
            except ValueError:
                # Empty path, or path on a different drive
                return None
        if os.sep != '/':
            relative = relative.replace(os.sep, '/')
        return relative

    def match(self, full_path, is_dir=None):
        """
        :param str full_path: Full path to file or folder
        :param bool|None is_dir: If applicable, caller can indicate whether 'full_path' is a directory or not (to save a file stat call)
        :return bool: True if 'full_path' is an ignore-match by this pattern, False otherwise
        """
        assert not self.invalid
        if self.applies_to_directories:
            if is_dir is None:
                is_dir = os.path.isdir(full_path)
            if not is_dir:
                return False
        if self.match_basename:
            name = os.path.basename(full_path)
        else:
            name = self._relative_path(full_path)
            if name is None:
                return False
        if self.exact_match is not None:
            return name == self.exact_match
        if self._segments is not None:
            parts = name.split('/')
            if len(parts) != len(self._segments):
                return False
            return all(regex.match(part) for regex, part in zip(self._segments, parts))
        assert self.regex
        return self.regex.match(name) is not None
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment