Skip to content

Instantly share code, notes, and snippets.

@chriskuehl
Created August 6, 2015 06:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save chriskuehl/dc4a8232ac77e80f4c7c to your computer and use it in GitHub Desktop.
Save chriskuehl/dc4a8232ac77e80f4c7c to your computer and use it in GitHub Desktop.
import io
import re
from itertools import chain
from os.path import splitext
# Git file modes, as passed to classify() in its `mode` argument.
GIT_MODE_FILE = 0o100644
GIT_MODE_EXECUTABLE = 0o100755
GIT_MODE_SYMLINK = 0o120000
GIT_MODE_SUBMODULE = 0o160000

# Regex (matched against the file extension without its leading dot)
# -> tags to apply.
KNOWN_EXTENSIONS = {
    r'^js$': ['javascript'],
    r'^json$': ['json'],
    r'^py$': ['python'],
}

# Regex (matched against the interpreter name taken from a shebang line)
# -> tags to apply.
# Raw strings are required here: '\.' in a plain string literal is an
# invalid escape sequence and triggers a warning on modern Python.
KNOWN_INTERPRETERS = {
    r'^python([23](\.[0-9]+)?)?$': ['python'],
    r'^(ba|da)?sh$': ['shell'],
    r'^ruby$': ['ruby'],
    r'^node(js)?$': ['javascript'],
}
def classify(path, mode):
    """Compute the tags describing a file.

    Symlinks and submodules receive a single tag naming their kind, and
    unrecognised modes receive no tags. Regular and executable files are
    tagged as binary or text, as executable or nonexecutable, and with any
    types guessed from the shebang (text files only) and from the extension.

    :param path: path to the file
    :param mode: Git mode of the file
    :return: set of tags
    """
    if mode == GIT_MODE_SYMLINK:
        return {'symlink'}
    if mode == GIT_MODE_SUBMODULE:
        return {'submodule'}
    if mode not in (GIT_MODE_FILE, GIT_MODE_EXECUTABLE):
        return set()

    labels = set()

    # Content sniffing first; only text files can carry a shebang.
    if _is_binary(path):
        labels.add('binary')
    else:
        labels.add('text')
        labels.update(_guess_types_from_shebang(path))

    labels.add(
        'executable' if mode == GIT_MODE_EXECUTABLE else 'nonexecutable'
    )

    # The extension is consulted for binary and text files alike.
    labels.update(_guess_types_from_extension(path))
    return labels
def _guess_types_from_extension(path):
    """Look up the types associated with a path's file extension.

    :param path: path to the file
    :return: iterable of types whose KNOWN_EXTENSIONS regex matches the
        extension, possibly empty
    """
    extension = splitext(path)[1]
    # splitext keeps the leading dot; the regexes expect it removed.
    return _types_from_regex_dict(KNOWN_EXTENSIONS, extension[1:])
def _is_binary(path):
    """Tell whether a file looks like binary data.

    Only the first kilobyte is inspected. The file counts as binary as soon
    as that chunk holds a byte outside the accepted "text" set; an empty
    file is therefore reported as text.

    The accepted set loosely follows libmagic's binary/text detection:
    https://github.com/file/file/blob/master/src/encoding.c#L203-L228

    :param path: path to the file
    :return: True if the file seems binary, False otherwise
    """
    control_bytes = bytearray([7, 8, 9, 10, 12, 13, 27])
    printable_ascii = bytearray(range(0x20, 0x7F))
    high_bytes = bytearray(range(0x80, 0x100))
    allowed = control_bytes + printable_ascii + high_bytes

    with io.open(path, 'rb') as handle:
        head = handle.read(1024)  # first KB only

    # Deleting every allowed byte leaves just the suspicious ones.
    leftover = head.translate(None, allowed)
    return len(leftover) > 0
def _guess_types_from_shebang(path):
    """Guess types for a text file based on its shebang line.

    The interpreter name is the basename of the first word of the shebang.
    When that name is ``env`` and exactly one argument follows, the argument
    is used as the interpreter name instead (``#!/usr/bin/env python``).

    :param path: path to text file
    :return: list of guessed types, possibly empty
    """
    interpreter = _read_shebang(path)
    if not interpreter:
        return []

    # split() without arguments tolerates repeated spaces and tabs.
    words = interpreter.split()
    name = words[0].split('/')[-1]

    # `name` is already a basename, so it has to be compared with 'env'
    # itself; it can never end with '/env'.
    if name == 'env' and len(words) == 2:
        # special case: #!/something/env <real_interpreter>
        name = words[1]

    # Materialize the result so both return paths yield a list.
    return list(_types_from_regex_dict(KNOWN_INTERPRETERS, name))
def _read_shebang(path):
    """Extract the interpreter part of a file's shebang line.

    At most 128 bytes are read from the start of the file. The first line
    (everything before the first newline, or the whole chunk when it holds
    no newline) must be pure ASCII and begin with ``#!``.

    :param path: path to text file
    :return: interpreter (part after #!, with surrounding whitespace
        stripped), or None if no shebang could be read
    """
    max_length = 128  # Linux kernel limit on shebangs

    with io.open(path, 'rb') as handle:
        head = handle.read(max_length)

    # Bytes after the first newline are irrelevant and may be non-ASCII.
    first_line = head.partition(b'\n')[0]
    try:
        text = first_line.decode('ascii')
    except UnicodeDecodeError:
        return None  # non-ASCII byte before the newline: no valid shebang

    if not text.startswith('#!'):
        return None
    return text[2:].strip()
def _types_from_regex_dict(dict, query):
    """Collect the values of every regex key that matches a query.

    Each key is applied with ``re.match``, i.e. anchored at the start of
    the query. The values of all matching keys are chained together into a
    single lazy iterable.

    :param dict: mapping of regex pattern -> iterable of values
    :param query: string to match the patterns against
    :return: iterable over the concatenated values, empty if nothing matches
    """
    matching_values = (
        values
        for pattern, values in dict.items()
        if re.match(pattern, query)
    )
    return chain.from_iterable(matching_values)
if __name__ == '__main__':
    # Manual smoke test: classify a handful of hard-coded (path, mode)
    # samples and print the resulting tags for each.
    samples = (
        ('/etc/passwd', GIT_MODE_FILE),
        ('/bin/bash', GIT_MODE_EXECUTABLE),
        ('/dev/urandom', GIT_MODE_FILE),
        ('/dev/null', GIT_MODE_FILE),
        ('/usr/lib/libdb-4.6.so', GIT_MODE_FILE),
        (
            '/usr/share/ca-certificates/mozilla/GeoTrust_Global_CA.crt',
            GIT_MODE_FILE,
        ),
        ('/usr/bin/reportbug', GIT_MODE_EXECUTABLE),
        ('/usr/lib/python3.4/contextlib.py', GIT_MODE_FILE),
        (
            '/home/c/ck/ckuehl/ocf-proj/puppet/modules/apache',
            GIT_MODE_SUBMODULE,
        ),
        ('/usr/bin/java', GIT_MODE_SYMLINK),
    )

    for sample_path, sample_mode in samples:
        sample_tags = classify(sample_path, sample_mode)
        print('{path}: {tags}'.format(path=sample_path, tags=sample_tags))
@asottile
Copy link

asottile commented Aug 7, 2015

https://gist.github.com/chriskuehl/dc4a8232ac77e80f4c7c#file-classifier-py-L104
Others have higher limits (cygwin iirc has the highest limit)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment