chriskuehl/classifier.py Secret

## classifier.py
import io
import re
from itertools import chain
from os.path import splitext

# Git file modes (octal) that classify() knows how to handle.
GIT_MODE_FILE = 0o100644
GIT_MODE_EXECUTABLE = 0o100755
GIT_MODE_SYMLINK = 0o120000
GIT_MODE_SUBMODULE = 0o160000

# Regex -> tags. Patterns are matched against the filename extension with
# its leading dot removed.
KNOWN_EXTENSIONS = {
    r'^js$': ['javascript'],
    r'^json$': ['json'],
    r'^py$': ['python'],
}

# Regex -> tags. Patterns are matched against the interpreter name taken
# from a script's shebang line.
# Raw strings are used so that ``\.`` stays a regex escape rather than an
# invalid string escape (which newer Python versions warn about).
KNOWN_INTERPRETERS = {
    r'^python([23](\.[0-9]+)?)?$': ['python'],
    r'^(ba|da)?sh$': ['shell'],
    r'^ruby$': ['ruby'],
    r'^node(js)?$': ['javascript'],
}


def classify(path, mode):
    """Compute the set of tags describing a file.

    Regular and executable files are tagged ``binary`` or ``text`` (text
    files also get any types guessed from their shebang), ``executable`` or
    ``nonexecutable``, plus any types guessed from the extension. Symlinks
    and submodules get a single tag each; any other mode yields no tags.

    :param path: path to the file
    :param mode: Git mode of the file
    :return: set of tags
    """
    # Modes that never require reading the file are handled up front.
    if mode == GIT_MODE_SYMLINK:
        return {'symlink'}
    if mode == GIT_MODE_SUBMODULE:
        return {'submodule'}
    if mode not in (GIT_MODE_FILE, GIT_MODE_EXECUTABLE):
        return set()

    if _is_binary(path):
        tags = {'binary'}
    else:
        # Shebangs are only inspected for files that look like text.
        tags = {'text'}
        tags.update(_guess_types_from_shebang(path))

    tags.add('executable' if mode == GIT_MODE_EXECUTABLE else 'nonexecutable')

    # The extension is consulted for binary and text files alike.
    tags.update(_guess_types_from_extension(path))
    return tags


def _guess_types_from_extension(path):
    """Look up types for a file from its filename extension.

    :param path: path to the file
    :return: iterable of types from KNOWN_EXTENSIONS, possibly empty
    """
    extension = splitext(path)[1]
    # splitext() keeps the leading dot; the patterns are written without it.
    bare_extension = extension[1:]
    return _types_from_regex_dict(KNOWN_EXTENSIONS, bare_extension)


def _is_binary(path):
    """Decide whether a file looks like binary data.

    Only the first kilobyte is inspected. The file counts as binary when
    that prefix contains any byte outside the "text" set: a handful of
    control characters, printable ASCII (0x20-0x7E) and all bytes >= 0x80.
    An empty file is therefore not binary.

    This is roughly based on libmagic's binary/text detection:
    https://github.com/file/file/blob/master/src/encoding.c#L203-L228

    :param path: path to the file
    :return: True if the file seems to be binary, False otherwise
    """
    with io.open(path, 'rb') as handle:
        head = handle.read(1024)  # only read first KB

    allowed = bytearray([7, 8, 9, 10, 12, 13, 27])
    allowed.extend(range(0x20, 0x7F))
    allowed.extend(range(0x80, 0x100))

    # Delete every allowed byte; whatever survives is a non-text byte.
    leftover = head.translate(None, allowed)
    return len(leftover) > 0


def _guess_types_from_shebang(path):
    """Guess types for a text file based on its shebang.

    The basename of the interpreter is matched against KNOWN_INTERPRETERS.
    When the interpreter is ``env`` (e.g. ``#!/usr/bin/env python``), the
    program that env is asked to run is used instead; option flags such as
    ``-S`` between ``env`` and the program are skipped.

    :param path: path to text file
    :return: list of guessed types, possibly empty
    """
    interpreter = _read_shebang(path)
    if not interpreter:
        return []

    # split() without an argument tolerates repeated spaces and tabs.
    words = interpreter.split()
    name = words[0].rsplit('/', 1)[-1]
    if name == 'env':
        # special case: #!/something/env <real_interpreter>
        # ``name`` is already a basename and can never contain a slash, so
        # it has to be compared with 'env' itself; testing for a '/env'
        # suffix would never match.
        programs = [word for word in words[1:] if not word.startswith('-')]
        if not programs:
            return []
        name = programs[0].rsplit('/', 1)[-1]

    # Materialise the matches so every code path returns a list.
    return list(_types_from_regex_dict(KNOWN_INTERPRETERS, name))


def _read_shebang(path):
    """Read a shebang from a file.

    At most MAX_SHEBANG_LENGTH bytes are read, and only the part before the
    first newline is considered. That part must be pure ASCII and start
    with ``#!``; otherwise there is no valid shebang.

    :param path: path to text file
    :return: interpreter (part after #!, surrounding whitespace stripped),
        or None if no shebang could be read
    """
    MAX_SHEBANG_LENGTH = 128  # Linux kernel limit on shebangs

    with io.open(path, 'rb') as script:
        head = script.read(MAX_SHEBANG_LENGTH)

    # Bytes after the first newline are never inspected.
    first_line = head.partition(b'\n')[0]
    try:
        text = first_line.decode('ascii')
    except UnicodeDecodeError:
        return None  # no valid shebang

    if not text.startswith('#!'):
        return None
    return text[2:].strip()


def _types_from_regex_dict(dict, query):
    """Collect the values of every regex key that matches a query.

    Each key is a regular expression tried against the start of ``query``
    with ``re.match``. The values of all matching keys are chained together
    lazily; if nothing matches, the result is simply empty.

    :param dict: mapping of regex pattern -> iterable of values
    :param query: string to match against each pattern
    :return: iterator over the concatenated values of all matching keys
    """
    matching_values = (
        values
        for pattern, values in dict.items()
        if re.match(pattern, query)
    )
    return chain.from_iterable(matching_values)


if __name__ == '__main__':
    # Ad-hoc smoke test: classify a few hard-coded paths and print the tags
    # computed for each one.
    tests = [
        ('/etc/passwd', GIT_MODE_FILE),
        ('/bin/bash', GIT_MODE_EXECUTABLE),
        ('/dev/urandom', GIT_MODE_FILE),
        ('/dev/null', GIT_MODE_FILE),
        ('/usr/lib/libdb-4.6.so', GIT_MODE_FILE),
        ('/usr/share/ca-certificates/mozilla/GeoTrust_Global_CA.crt', GIT_MODE_FILE),
        ('/usr/bin/reportbug', GIT_MODE_EXECUTABLE),
        ('/usr/lib/python3.4/contextlib.py', GIT_MODE_FILE),
        ('/home/c/ck/ckuehl/ocf-proj/puppet/modules/apache', GIT_MODE_SUBMODULE),
        ('/usr/bin/java', GIT_MODE_SYMLINK),
    ]

    for path, mode in tests:
        tags = classify(path, mode)
        print('{path}: {tags}'.format(path=path, tags=tags))
	import io
	import re
	from itertools import chain
	from os.path import splitext

	# Git file modes (octal) that classify() knows how to handle.
	GIT_MODE_FILE = 0o100644
	GIT_MODE_EXECUTABLE = 0o100755
	GIT_MODE_SYMLINK = 0o120000
	GIT_MODE_SUBMODULE = 0o160000

	# Regex -> tags. Patterns are matched against the filename extension with
	# its leading dot removed.
	KNOWN_EXTENSIONS = {
	    r'^js$': ['javascript'],
	    r'^json$': ['json'],
	    r'^py$': ['python'],
	}

	# Regex -> tags. Patterns are matched against the interpreter name taken
	# from a script's shebang line.
	# Raw strings keep ``\.`` a regex escape rather than an invalid string
	# escape. The shell pattern uses a plain ``|`` alternation: an escaped
	# pipe would match a literal '|' and stop "bash"/"dash" from matching.
	KNOWN_INTERPRETERS = {
	    r'^python([23](\.[0-9]+)?)?$': ['python'],
	    r'^(ba|da)?sh$': ['shell'],
	    r'^ruby$': ['ruby'],
	    r'^node(js)?$': ['javascript'],
	}


	def classify(path, mode):
	    """Compute the set of tags describing a file.

	    Regular and executable files are tagged ``binary`` or ``text`` (text
	    files also get any types guessed from their shebang), ``executable`` or
	    ``nonexecutable``, plus any types guessed from the extension. Symlinks
	    and submodules get a single tag each; any other mode yields no tags.

	    :param path: path to the file
	    :param mode: Git mode of the file
	    :return: set of tags
	    """
	    tags = set()

	    if mode in (GIT_MODE_FILE, GIT_MODE_EXECUTABLE):
	        if _is_binary(path):
	            tags.add('binary')
	        else:
	            # Shebangs are only inspected for files that look like text.
	            tags.add('text')
	            tags.update(_guess_types_from_shebang(path))

	        if mode == GIT_MODE_EXECUTABLE:
	            tags.add('executable')
	        else:
	            tags.add('nonexecutable')

	        # The extension is consulted for binary and text files alike.
	        tags.update(_guess_types_from_extension(path))
	    elif mode == GIT_MODE_SYMLINK:
	        tags.add('symlink')
	    elif mode == GIT_MODE_SUBMODULE:
	        tags.add('submodule')

	    return tags


	def _guess_types_from_extension(path):
	    """Guess types for a file based on its filename extension.

	    :param path: path to the file
	    :return: iterable of types from KNOWN_EXTENSIONS, possibly empty
	    """
	    __, ext = splitext(path)
	    # splitext() keeps the leading dot; the patterns are written without it.
	    return _types_from_regex_dict(KNOWN_EXTENSIONS, ext[1:])


	def _is_binary(path):
	    """Return whether the file seems to be binary.

	    Only the first kilobyte is inspected. The file counts as binary when
	    that prefix contains any byte outside the "text" set: a handful of
	    control characters, printable ASCII (0x20-0x7E) and all bytes >= 0x80.
	    An empty file is therefore not binary.

	    This is roughly based on libmagic's binary/text detection:
	    https://github.com/file/file/blob/master/src/encoding.c#L203-L228

	    :param path: path to the file
	    :return: True if the file seems to be binary, False otherwise
	    """
	    text_chars = (
	        bytearray([7, 8, 9, 10, 12, 13, 27]) +
	        bytearray(range(0x20, 0x7F)) +
	        bytearray(range(0x80, 0x100))
	    )
	    with io.open(path, 'rb') as f:
	        b = f.read(1024)  # only read first KB
	    # Delete every text byte; whatever survives is a non-text byte.
	    return bool(b.translate(None, text_chars))


	def _guess_types_from_shebang(path):
	    """Guess types for a text file based on its shebang.

	    The basename of the interpreter is matched against KNOWN_INTERPRETERS.
	    When the interpreter is ``env`` (e.g. ``#!/usr/bin/env python``), the
	    program that env is asked to run is used instead; option flags such as
	    ``-S`` between ``env`` and the program are skipped.

	    :param path: path to text file
	    :return: list of guessed types, possibly empty
	    """
	    interpreter = _read_shebang(path)
	    if not interpreter:
	        return []

	    # split() without an argument tolerates repeated spaces and tabs.
	    words = interpreter.split()
	    name = words[0].rsplit('/', 1)[-1]
	    if name == 'env':
	        # special case: #!/something/env <real_interpreter>
	        # ``name`` is already a basename and can never contain a slash, so
	        # it has to be compared with 'env' itself; testing for a '/env'
	        # suffix would never match.
	        programs = [word for word in words[1:] if not word.startswith('-')]
	        if not programs:
	            return []
	        name = programs[0].rsplit('/', 1)[-1]

	    # Materialise the matches so every code path returns a list.
	    return list(_types_from_regex_dict(KNOWN_INTERPRETERS, name))


	def _read_shebang(path):
	    """Read a shebang from a file.

	    At most MAX_SHEBANG_LENGTH bytes are read, and only the part before the
	    first newline is considered. That part must be pure ASCII and start
	    with ``#!``; otherwise there is no valid shebang.

	    :param path: path to text file
	    :return: interpreter (part after #!, surrounding whitespace stripped),
	        or None if no shebang could be read
	    """
	    MAX_SHEBANG_LENGTH = 128  # Linux kernel limit on shebangs

	    with io.open(path, 'rb') as f:
	        bytes_read = f.read(MAX_SHEBANG_LENGTH)

	    # Bytes after the first newline are never inspected.
	    first_line = bytes_read.partition(b'\n')[0]
	    try:
	        chars_read = first_line.decode('ascii')
	    except UnicodeDecodeError:
	        return None  # no valid shebang

	    if chars_read.startswith('#!'):
	        return chars_read[2:].strip()
	    return None


	def _types_from_regex_dict(dict, query):
	    """Fetch values from a dictionary with regex keys.

	    Each key is a regular expression tried against the start of ``query``
	    with ``re.match``. If the query matches multiple regexes, their values
	    are chained together; if it matches none, the result is empty.

	    :param dict: mapping of regex pattern -> iterable of values
	    :param query: string to match against each pattern
	    :return: iterator over the concatenated values of all matching keys
	    """
	    return chain.from_iterable(
	        value for key, value in dict.items() if re.match(key, query)
	    )


	if __name__ == '__main__':
	    # Ad-hoc smoke test: classify a few hard-coded paths and print the tags
	    # computed for each one.
	    tests = [
	        ('/etc/passwd', GIT_MODE_FILE),
	        ('/bin/bash', GIT_MODE_EXECUTABLE),
	        ('/dev/urandom', GIT_MODE_FILE),
	        ('/dev/null', GIT_MODE_FILE),
	        ('/usr/lib/libdb-4.6.so', GIT_MODE_FILE),
	        ('/usr/share/ca-certificates/mozilla/GeoTrust_Global_CA.crt', GIT_MODE_FILE),
	        ('/usr/bin/reportbug', GIT_MODE_EXECUTABLE),
	        ('/usr/lib/python3.4/contextlib.py', GIT_MODE_FILE),
	        ('/home/c/ck/ckuehl/ocf-proj/puppet/modules/apache', GIT_MODE_SUBMODULE),
	        ('/usr/bin/java', GIT_MODE_SYMLINK),
	    ]

	    for path, mode in tests:
	        print('{path}: {tags}'.format(
	            path=path,
	            tags=classify(path, mode)
	        ))