Created
May 1, 2022 11:49
-
-
Save jul/32b487e36ea4b22b1b0825e770d0d5b8 to your computer and use it in GitHub Desktop.
minimalist magic detection for fun
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
from requests import get | |
import re | |
'''
Sometimes you want minimalistic magic-number detection for simple file types.
For instance, you scrape the web and you don't trust the MIME type returned by the web server
(this actually happened to me with the Bibliothèque nationale de France more than once).
So you want to make sure that, for instance, the returned content is actually a JPEG, and you may not be
on a Unix OS with libmagic, and you may not want to add dependencies, and ...
just comparing 3 bytes in a row in a raw byte stream is not THAT hard.
'''
""" How to build the dict (not for the faint of heart) by parsing a random web page.
As with magic numbers: HTML should not be parsed with regular expressions, but sometimes that does
the job without the mental overhead of remembering how to use a specific parser (lxml).
In production code, where data should be guaranteed to be valid, use a parser, mmok?
"""
#content = get("https://gist.githubusercontent.com/leommoore/f9e57ba2aa4bf197ebc5/raw/e59c296951e0588509b1f777d1f98b2ce08272ad/file_magic_numbers.md").content.decode("utf8")
#row_re = re.compile(r"""<tr>\s*<td>(?P<type>.+)</td>\s*<td>[^<]+</td>\s*<td>(?P<magic>[a-f0-9\ ]+)</td>""",re.MULTILINE|re.VERBOSE| re.I)
#magic_to_type = dict(map(lambda t: (tuple(map(lambda st:int(st,16), t[1].split(" "))), t[0]), row_re.findall(content)))
# Signature table: leading bytes of a file (as a tuple of ints) -> description.
# The trailing comment on each entry shows the same signature as bytes/ASCII.
# Insertion order is kept as in the original listing.
# NOTE(review): the POSIX tar "ustar" marker is normally located at offset 257
# of an archive rather than at its start, so that entry may never match when
# only leading bytes are compared -- confirm before relying on it.
magic_to_type = {
    (66, 77): 'Bitmap format',  # "BM"
    (83, 73, 77, 80, 76, 69): 'FITS format',  # "SIMPLE"
    (71, 73, 70, 56): 'GIF format',  # "GIF8"
    (71, 75, 83, 77): 'Graphics Kernel System',  # "GKSM"
    (1, 218): 'IRIS rgb format',  # 01 DA
    (241, 0, 64, 187): 'ITC (CMU WM) format',  # F1 00 40 BB
    (255, 216, 255, 224): 'JPEG File Interchange Format',  # FF D8 FF E0
    (73, 73, 78, 49): 'NIFF (Navy TIFF)',  # "IIN1"
    (86, 73, 69, 87): 'PM format',  # "VIEW"
    (137, 80, 78, 71): 'PNG format',  # 89 "PNG"
    (37, 33): 'Postscript format',  # "%!"
    (89, 166, 106, 149): 'Sun Rasterfile',  # 59 A6 6A 95
    (77, 77, 0, 42): 'TIFF format (Motorola - big endian) ',  # "MM" 00 2A
    (73, 73, 42, 0): 'TIFF format (Intel - little endian) ',  # "II" 2A 00
    (103, 105, 109, 112, 32, 120, 99, 102, 32, 118): 'XCF Gimp file structure',  # "gimp xcf v"
    (35, 70, 73, 71): 'Xfig format',  # "#FIG"
    (47, 42, 32, 88, 80, 77, 32, 42, 47): 'XPM format',  # "/* XPM */"
    (66, 90): 'Bzip',  # "BZ"
    (31, 157): 'Compress',  # 1F 9D
    (31, 139): 'gzip format',  # 1F 8B
    (80, 75, 3, 4): 'pkzip format',  # "PK" 03 04
    (117, 115, 116, 97, 114): 'TAR (POSIX)',  # "ustar"
    (77, 90): 'MS-DOS, OS/2 or MS Windows',  # "MZ"
    (127, 69, 76, 70): 'Unix elf',  # 7F "ELF"
    (153, 0): 'pgp public ring',  # 99 00
    (149, 1): 'pgp security ring',  # 95 01
    (149, 0): 'pgp security ring',  # 95 00
    (166, 0): 'pgp encrypted data',  # A6 00
}
def magic_detect(a_byte_array, table=None):
    """Return the description of the signature that *a_byte_array* starts with.

    Deliberately simple and partial: every signature in the table is compared
    against the leading bytes of the input.

    Args:
        a_byte_array: Sliceable sequence of byte values (``bytes``,
            ``bytearray``, or a list/tuple of ints), typically the first few
            bytes of a file or of a downloaded payload.
        table: Optional mapping of signature (sequence of ints) to
            description. Defaults to the module-level ``magic_to_type``,
            looked up at call time so entries registered later are seen.

    Returns:
        The description associated with the longest matching signature, or
        ``None`` when no signature matches (including empty input).
    """
    if table is None:
        table = magic_to_type

    available = len(a_byte_array)
    best_size = -1
    best_name = None
    for pattern, name in table.items():
        size = len(pattern)
        # Skip signatures that cannot beat the current match or that are
        # longer than the data we were given.
        if size <= best_size or size > available:
            continue
        # Prefer the longest matching signature so that a short, generic
        # entry never shadows a more specific one, whatever the table order.
        if tuple(a_byte_array[:size]) == tuple(pattern):
            best_size, best_name = size, name
    return best_name
def _read_head(path, size):
    """Return the first *size* bytes of the file at *path*, closing it afterwards."""
    with open(path, "rb") as handle:
        return handle.read(size)


# Demo: runs at module level, performs a network request and reads local
# files whose paths are hard-coded below.
print(magic_detect(get("https://www.python.org/static/img/python-logo.png").content))
print(magic_detect(_read_head("/usr/lib/x86_64-linux-gnu/libelf.so.1", 129)))
# The byte-compiled signatures are only registered below, so this first
# attempt is expected to print None.
print(magic_detect(_read_head("__pycache__/get_magic.cpython-39.pyc", 100)))
magic_to_type[(0x61, 0x0d, 0x0d, 0x0a)] = "python 3.9 byte-compiled"
magic_to_type[(0x33, 0x0d, 0x0d, 0x0a)] = "python 3.6 byte-compiled"
print(magic_detect(_read_head("__pycache__/get_magic.cpython-39.pyc", 100)))
print(magic_detect(_read_head("__pycache__/get_magic.cpython-36.pyc", 100)))
# "#!" as byte values, i.e. (35, 33).
magic_to_type[tuple(map(ord, "#!"))] = "Shebang unix script"
print(magic_detect(_read_head("get_magic.py", 100)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment