-
-
Save chriskuehl/dc4a8232ac77e80f4c7c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import io | |
import re | |
from itertools import chain | |
from os.path import splitext | |
# Git file modes recognised by classify().
GIT_MODE_FILE = 0o100644
GIT_MODE_EXECUTABLE = 0o100755
GIT_MODE_SYMLINK = 0o120000
GIT_MODE_SUBMODULE = 0o160000

# Regex (matched against the file extension without its leading dot) -> tags.
KNOWN_EXTENSIONS = {
    r'^js$': ['javascript'],
    r'^json$': ['json'],
    r'^py$': ['python'],
}

# Regex (matched against the interpreter name taken from a shebang) -> tags.
# Raw strings are used so that regex escapes such as `\.` are not treated as
# (invalid) string escape sequences by the Python parser.
KNOWN_INTERPRETERS = {
    r'^python([23](\.[0-9]+)?)?$': ['python'],
    r'^(ba|da)?sh$': ['shell'],
    r'^ruby$': ['ruby'],
    r'^node(js)?$': ['javascript'],
}
def classify(path, mode):
    """Compute the tags that describe a file.

    :param path: path to the file
    :param mode: Git mode of the file
    :return: set of tags; empty when the mode is not one of the known
        GIT_MODE_* values
    """
    # Symlinks and submodules are tagged by mode alone; the path is not read.
    if mode == GIT_MODE_SYMLINK:
        return {'symlink'}
    if mode == GIT_MODE_SUBMODULE:
        return {'submodule'}
    if mode not in (GIT_MODE_FILE, GIT_MODE_EXECUTABLE):
        return set()

    labels = set()

    # Content-based tags: the shebang is only inspected for text files.
    if _is_binary(path):
        labels.add('binary')
    else:
        labels.add('text')
        labels.update(_guess_types_from_shebang(path))

    # Mode-based tag.
    if mode == GIT_MODE_EXECUTABLE:
        labels.add('executable')
    else:
        labels.add('nonexecutable')

    # Name-based tags apply to binary and text files alike.
    labels.update(_guess_types_from_extension(path))
    return labels
def _guess_types_from_extension(path):
    """Guess types for a file from its filename extension.

    :param path: path to the file
    :return: iterable of tags whose KNOWN_EXTENSIONS pattern matches
    """
    suffix = splitext(path)[1]
    # splitext keeps the leading dot on the extension; the patterns in
    # KNOWN_EXTENSIONS are written without it.
    bare_extension = suffix[1:]
    return _types_from_regex_dict(KNOWN_EXTENSIONS, bare_extension)
def _is_binary(path):
    """Return whether the file seems to be binary.

    Only the first 1024 bytes are examined. The file is reported as binary
    when any of those bytes falls outside the set of "text" byte values
    below; an empty file is therefore reported as not binary.

    This is roughly based on libmagic's binary/text detection:
    https://github.com/file/file/blob/master/src/encoding.c#L203-L228

    :param path: path to the file
    :return: True if a non-text byte was found, otherwise False
    """
    control_bytes = bytearray([7, 8, 9, 10, 12, 13, 27])
    printable_ascii = bytearray(range(0x20, 0x7F))
    high_bytes = bytearray(range(0x80, 0x100))
    allowed = control_bytes + printable_ascii + high_bytes

    with io.open(path, 'rb') as handle:
        head = handle.read(1024)  # only read first KB

    # Delete every allowed byte; anything left over is a non-text byte.
    leftover = head.translate(None, allowed)
    return len(leftover) > 0
def _guess_types_from_shebang(path):
    """Guess types for a text file based on its shebang.

    The interpreter is identified by the basename of the first word of the
    shebang. When that basename is ``env`` and exactly one argument follows
    (``#!/usr/bin/env python``), the argument is used as the interpreter
    instead.

    :param path: path to text file
    :return: iterable of guessed types, possibly empty
    """
    interpreter = _read_shebang(path)
    if not interpreter:
        return []

    # Split on any run of whitespace so repeated spaces or tabs between the
    # interpreter and its argument do not produce empty words.
    words = interpreter.split()
    name = words[0].split('/')[-1]

    # `name` is already a basename, so it must be compared with 'env'
    # directly; it can never end with '/env'.
    if name == 'env' and len(words) == 2:
        # special case: #!/something/env <real_interpreter>
        name = words[1].split('/')[-1]

    return _types_from_regex_dict(KNOWN_INTERPRETERS, name)
def _read_shebang(path):
    """Read a shebang from a file.

    Only the first MAX_SHEBANG_LENGTH bytes of the file are examined. The
    bytes up to the first newline (or all of them when there is no newline)
    must be ASCII; if any of them is not, no shebang is reported.

    :param path: path to text file
    :return: interpreter (part after #!, surrounding whitespace removed),
        or None if no shebang could be read
    """
    MAX_SHEBANG_LENGTH = 128  # Linux kernel limit on shebangs

    with io.open(path, 'rb') as script:
        head = script.read(MAX_SHEBANG_LENGTH)

    # Everything before the first newline is the candidate shebang line.
    first_line = head.partition(b'\n')[0]
    try:
        text = first_line.decode('ascii')
    except UnicodeDecodeError:
        return None  # no valid shebang

    if not text.startswith('#!'):
        return None
    return text[2:].strip()
def _types_from_regex_dict(dict, query):
    """Look up values in a dictionary whose keys are regexes.

    Each key is tried against the query with ``re.match``. The values of all
    matching keys are chained together into one iterable.

    :param dict: mapping of regex pattern to an iterable of values
    :param query: string to match against each pattern
    :return: iterator over the concatenated values of every matching
        pattern; yields nothing when no pattern matches
    """
    matching_values = (
        values
        for pattern, values in dict.items()
        if re.match(pattern, query)
    )
    return chain.from_iterable(matching_values)
if __name__ == '__main__':
    # Ad-hoc demo: classify a handful of (path, Git mode) pairs and print the
    # resulting tags. The paths are machine-specific and must exist for the
    # regular-file entries, since those are opened and read.
    tests = [
        ('/etc/passwd', GIT_MODE_FILE),
        ('/bin/bash', GIT_MODE_EXECUTABLE),
        ('/dev/urandom', GIT_MODE_FILE),
        ('/dev/null', GIT_MODE_FILE),
        ('/usr/lib/libdb-4.6.so', GIT_MODE_FILE),
        ('/usr/share/ca-certificates/mozilla/GeoTrust_Global_CA.crt', GIT_MODE_FILE),
        ('/usr/bin/reportbug', GIT_MODE_EXECUTABLE),
        ('/usr/lib/python3.4/contextlib.py', GIT_MODE_FILE),
        ('/home/c/ck/ckuehl/ocf-proj/puppet/modules/apache', GIT_MODE_SUBMODULE),
        ('/usr/bin/java', GIT_MODE_SYMLINK),
    ]

    for path, mode in tests:
        tags = classify(path, mode)
        line = '{path}: {tags}'.format(path=path, tags=tags)
        print(line)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
https://gist.github.com/chriskuehl/dc4a8232ac77e80f4c7c#file-classifier-py-L104
Others have higher limits (cygwin iirc has the highest limit)