Skip to content

Instantly share code, notes, and snippets.

@danlmyers
Created March 7, 2015 03:18
Show Gist options
  • Save danlmyers/73a0e6aa42d987437d7e to your computer and use it in GitHub Desktop.
Save danlmyers/73a0e6aa42d987437d7e to your computer and use it in GitHub Desktop.
import os

# Magic numbers (file signatures) used to determine file types, keyed by the
# short type name that determine_filetype() returns.
# A partial listing is available at:
# http://en.wikipedia.org/wiki/List_of_file_signatures
#
# Each entry holds:
#   'numbers': the alternative byte signatures that identify the type
#   'offset':  the byte position in the file at which the signature starts
#
# Signatures are bytes literals because they are compared against data read
# from the file in binary mode.
MAGIC_NUMBERS = {
    'zip': {'numbers': [b'\x50\x4B\x03\x04'], 'offset': 0},
    'gz': {'numbers': [b'\x1F\x8B\x08'], 'offset': 0},
    'bz2': {'numbers': [b'\x42\x5A\x68'], 'offset': 0},
    'tar': {
        'numbers': [
            b'\x75\x73\x74\x61\x72\x00\x30\x30',
            b'\x75\x73\x74\x61\x72\x20\x20\x00',
        ],
        'offset': 257,
    },
    'rar': {
        'numbers': [
            b'\x52\x61\x72\x21\x1A\x07\x00',
            b'\x52\x61\x72\x21\x1A\x07\x01\x00',
        ],
        'offset': 0,
    },
    '7z': {'numbers': [b'\x37\x7A\xBC\xAF\x27\x1C'], 'offset': 0},
    'Z': {'numbers': [b'\x1F\x9D'], 'offset': 0},
}


def determine_filetype(target_file):
    """
    Read the header of a file and determine the file type from its magic number.

    Only the leading bytes of the file are inspected. No inference is made
    about what an archive contains: a tar.gz file is reported as ``gz``, not
    as a tar archive.

    :param target_file: Path of the file to check.
    :return: Short name of the file type as configured in MAGIC_NUMBERS
        (zip, gz, bz2, tar, rar, 7z, Z). For files carrying the zip signature
        whose extension is one of apk, docx, jar, odp, ods, odt, pptx, xlsx or
        zipx, that extension is returned instead of ``zip``. Returns False
        when the path is not a regular file or no signature matches.
    """
    if not os.path.isfile(target_file):
        # Not a regular file, don't bother.
        return False

    # Formats that are zip containers; told apart only by file extension.
    alternate_zips = ('apk', 'docx', 'jar', 'odp', 'ods', 'odt', 'pptx',
                      'xlsx', 'zipx')

    # Read just enough bytes to cover the furthest-reaching signature.
    header_length = max(
        spec['offset'] + len(magic)
        for spec in MAGIC_NUMBERS.values()
        for magic in spec['numbers']
    )

    # Binary mode is required: the signatures are raw bytes, and text mode
    # would try to decode them (or translate newlines) and fail or corrupt
    # the header.
    with open(target_file, 'rb') as raw_file:
        headers = raw_file.read(header_length)

    for file_type, spec in MAGIC_NUMBERS.items():
        # startswith() takes a tuple of candidates and a start position, so
        # all signatures for a type are checked at its offset in one call.
        if not headers.startswith(tuple(spec['numbers']), spec['offset']):
            continue
        if file_type == 'zip':
            file_extension = os.path.splitext(target_file)[1][1:]
            if file_extension in alternate_zips:
                return file_extension
        return file_type

    return False  # No filetypes matched.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment