Skip to content

Instantly share code, notes, and snippets.

@jul
Created May 1, 2022 11:49
Show Gist options
  • Save jul/32b487e36ea4b22b1b0825e770d0d5b8 to your computer and use it in GitHub Desktop.
Save jul/32b487e36ea4b22b1b0825e770d0d5b8 to your computer and use it in GitHub Desktop.
minimalist magic detection for fun
#!/usr/bin/env python3
from requests import get
import re
'''
Sometimes you want minimalistic magic-number detection for simple file types.
For instance, you scrape the web and you don't trust the web server when it returns the MIME type
(this actually happened to me with the Bibliothèque nationale de France more than once).
So you want to make sure that, for instance, the returned content is actually a JPEG, and you may not be
on a Unix OS with libmagic, and you may not want to add dependencies, and ...
just comparing 3 bytes in a row in a raw byte stream is not THAT hard.
'''
""" How to build the dict (not for the faint of heart): by parsing a random web page.
As with magic numbers, HTML should not be parsed with regular expressions, but sometimes it does
the job without the mental overhead of remembering how to use a specific parser (lxml).
In production code, where data should be guaranteed to be valid, use a parser, mmok?
"""
#content = get("https://gist.githubusercontent.com/leommoore/f9e57ba2aa4bf197ebc5/raw/e59c296951e0588509b1f777d1f98b2ce08272ad/file_magic_numbers.md").content.decode("utf8")
#f = re.compile("""<tr>\s*<td>(?P<type>.+)</td>\s*<td>[^<]+</td>\s*<td>(?P<magic>[a-f0-9\ ]+)</td>""",re.MULTILINE|re.VERBOSE| re.I)
#magic_to_type = dict(map(lambda t: (tuple(map(lambda st:int(st,16), t[1].split(" "))), t[0]), f.findall(content)))
# Signature table: maps the leading bytes of a file, stored as a tuple of
# ints, to a human-readable format name.  Signatures are written as bytes
# literals for readability; iterating a bytes object yields ints, so each key
# ends up as a tuple of ints.  Insertion order is kept stable because
# magic_detect() walks the table in order and returns on the first hit.
magic_to_type = {
    tuple(signature): label
    for signature, label in (
        (b"BM", 'Bitmap format'),
        (b"SIMPLE", 'FITS format'),
        (b"GIF8", 'GIF format'),
        (b"GKSM", 'Graphics Kernel System'),
        (b"\x01\xda", 'IRIS rgb format'),
        (b"\xf1\x00\x40\xbb", 'ITC (CMU WM) format'),
        (b"\xff\xd8\xff\xe0", 'JPEG File Interchange Format'),
        (b"IIN1", 'NIFF (Navy TIFF)'),
        (b"VIEW", 'PM format'),
        (b"\x89PNG", 'PNG format'),
        (b"%!", 'Postscript format'),
        (b"\x59\xa6\x6a\x95", 'Sun Rasterfile'),
        # The trailing space in the two TIFF names is part of the stored value.
        (b"MM\x00\x2a", 'TIFF format (Motorola - big endian) '),
        (b"II\x2a\x00", 'TIFF format (Intel - little endian) '),
        (b"gimp xcf v", 'XCF Gimp file structure'),
        (b"#FIG", 'Xfig format'),
        (b"/* XPM */", 'XPM format'),
        (b"BZ", 'Bzip'),
        (b"\x1f\x9d", 'Compress'),
        (b"\x1f\x8b", 'gzip format'),
        (b"PK\x03\x04", 'pkzip format'),
        # NOTE(review): POSIX tar presumably stores "ustar" at offset 257, not
        # at the start of the file, so a prefix match may never fire -- confirm.
        (b"ustar", 'TAR (POSIX)'),
        (b"MZ", 'MS-DOS, OS/2 or MS Windows'),
        (b"\x7fELF", 'Unix elf'),
        (b"\x99\x00", 'pgp public ring'),
        (b"\x95\x01", 'pgp security ring'),
        (b"\x95\x00", 'pgp security ring'),
        (b"\xa6\x00", 'pgp encrypted data'),
    )
}
def magic_detect(a_byte_array):
    """Return the format name whose signature starts ``a_byte_array``.

    Every signature in the module-level ``magic_to_type`` table is tried in
    the table's iteration order.  A signature matches when the input holds at
    least as many items as the signature and its leading items, taken as a
    tuple, equal the signature.  The name of the first match is returned;
    ``None`` is returned when nothing matches.

    This is a deliberately naive linear scan over a partial table; it trades
    speed and coverage for having no dependency beyond the table itself.
    """
    available = len(a_byte_array)
    for signature, label in magic_to_type.items():
        width = len(signature)
        # Inputs shorter than the signature cannot match it.
        if width <= available and tuple(a_byte_array[:width]) == signature:
            return label
    return None
def _read_head(path, size):
    """Return up to the first ``size`` bytes of the file at ``path``.

    The file is opened in binary mode inside a ``with`` block so the handle
    is closed as soon as the read finishes, instead of being left open.
    Errors from ``open`` (for example a missing file) propagate to the caller.
    """
    with open(path, "rb") as stream:
        return stream.read(size)


# Demo run: each print shows the detected format name, or None when no
# registered signature matches the start of the data.
print(magic_detect(get("https://www.python.org/static/img/python-logo.png").content))
print(magic_detect(_read_head("/usr/lib/x86_64-linux-gnu/libelf.so.1", 129)))
# No .pyc signature has been registered yet at this point.
print(magic_detect(_read_head("__pycache__/get_magic.cpython-39.pyc", 100)))
# The table is a plain dict, so new signatures can be registered at runtime.
magic_to_type[(0x61, 0x0d, 0x0d, 0x0a)] = "python 3.9 byte-compiled"
magic_to_type[(0x33, 0x0d, 0x0d, 0x0a)] = "python 3.6 byte-compiled"
print(magic_detect(_read_head("__pycache__/get_magic.cpython-39.pyc", 100)))
print(magic_detect(_read_head("__pycache__/get_magic.cpython-36.pyc", 100)))
# Key is the tuple of byte values of "#!", i.e. (35, 33).
magic_to_type[tuple(b"#!")] = "Shebang unix script"
print(magic_detect(_read_head("get_magic.py", 100)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment