jul/get_magic.py

## get_magic.py
#!/usr/bin/env python3
from requests import get
import re
'''
Sometimes you want minimalistic magic-number detection for simple file types.
For instance, you scrape the web and you don't trust the MIME type returned by the web server
(this actually happened to me with the Bibliothèque nationale de France more than once).
So you want to make sure that, for instance, the returned content is actually a JPEG, and you may not be
on a Unix OS with libmagic, and you may not want to add dependencies, and ...
just comparing 3 bytes in a row in a raw byte stream is not THAT hard.
'''

""" How to build the dict (not for the faint of heart) by parsing a random web page.
As with magic numbers, HTML should not be parsed with regular expressions, but sometimes it does
the job without requiring the mental overhead of remembering how to use a specific parser (lxml).
In production code where data should be guaranteed to be valid, use a parser, mmok?
"""
#content = get("https://gist.githubusercontent.com/leommoore/f9e57ba2aa4bf197ebc5/raw/e59c296951e0588509b1f777d1f98b2ce08272ad/file_magic_numbers.md").content.decode("utf8")

#f = re.compile("""<tr>\s*<td>(?P<type>.+)</td>\s*<td>[^<]+</td>\s*<td>(?P<magic>[a-f0-9\ ]+)</td>""",re.MULTILINE|re.VERBOSE| re.I)
#magic_to_type = dict(map(lambda t: (tuple(map(lambda st:int(st,16), t[1].split(" "))), t[0]),  f.findall(content)))

# Lookup table: leading bytes of a file (as a tuple of ints) -> format label.
# Keys are spelled as bytes literals so the ASCII part of each signature stays
# readable; tuple(b"...") produces a tuple of ints such as (66, 77) for b"BM",
# which is the key shape magic_detect compares against.
magic_to_type = {
    # Images and graphics
    tuple(b"BM"): 'Bitmap format',
    tuple(b"SIMPLE"): 'FITS format',
    tuple(b"GIF8"): 'GIF format',
    tuple(b"GKSM"): 'Graphics Kernel System',
    tuple(b"\x01\xda"): 'IRIS rgb format',
    tuple(b"\xf1\x00\x40\xbb"): 'ITC (CMU WM) format',
    tuple(b"\xff\xd8\xff\xe0"): 'JPEG File Interchange Format',
    tuple(b"IIN1"): 'NIFF (Navy TIFF)',
    tuple(b"VIEW"): 'PM format',
    tuple(b"\x89PNG"): 'PNG format',
    tuple(b"%!"): 'Postscript format',
    tuple(b"\x59\xa6\x6a\x95"): 'Sun Rasterfile',
    tuple(b"MM\x00\x2a"): 'TIFF format (Motorola - big endian) ',
    tuple(b"II\x2a\x00"): 'TIFF format (Intel - little endian) ',
    tuple(b"gimp xcf v"): 'XCF Gimp file structure',
    tuple(b"#FIG"): 'Xfig format',
    tuple(b"/* XPM */"): 'XPM format',
    # Archives and compressed streams
    tuple(b"BZ"): 'Bzip',
    tuple(b"\x1f\x9d"): 'Compress',
    tuple(b"\x1f\x8b"): 'gzip format',
    tuple(b"PK\x03\x04"): 'pkzip format',
    tuple(b"ustar"): 'TAR (POSIX)',
    # Executables
    tuple(b"MZ"): 'MS-DOS, OS/2 or MS Windows',
    tuple(b"\x7fELF"): 'Unix elf',
    # PGP
    tuple(b"\x99\x00"): 'pgp public ring',
    tuple(b"\x95\x01"): 'pgp security ring',
    tuple(b"\x95\x00"): 'pgp security ring',
    tuple(b"\xa6\x00"): 'pgp encrypted data',
}

def magic_detect(a_byte_array, table=None):
    """Guess a file format from the first bytes of its content.

    Simple, partial magic-number detection: every signature in the table is
    compared against the start of ``a_byte_array``. When several signatures
    match, the longest one wins, so a short generic signature cannot shadow
    a more specific one, whatever order the entries were inserted in.

    Args:
        a_byte_array: Sliceable sequence of ints (``bytes``, ``bytearray``,
            a list of ints, ...) holding the beginning of the content.
        table: Optional mapping of signature tuples (ints) to format names.
            Defaults to the module-level ``magic_to_type``.

    Returns:
        The name stored for the longest matching signature, or ``None`` when
        no signature matches (including when the input is too short).
    """
    if table is None:
        # Resolved at call time so entries registered later are honoured.
        table = magic_to_type

    available = len(a_byte_array)
    best_width = -1
    best_name = None
    for pattern, name in table.items():
        width = len(pattern)
        # Skip signatures longer than the input, and those that cannot
        # improve on the match already found.
        if width > available or width <= best_width:
            continue
        # Compare as tuples so any int sequence works, not only bytes.
        if tuple(a_byte_array[:width]) == pattern:
            best_width, best_name = width, name
    return best_name

def _read_head(path, size):
    """Return the first ``size`` bytes of the file at ``path``.

    The file is opened in binary mode and closed before returning, instead
    of leaving the handle for the garbage collector.
    """
    with open(path, "rb") as handle:
        return handle.read(size)


# Demo. NOTE: these statements run at module level, so they execute on import;
# they fetch a URL and read the files named below.
print(magic_detect(get("https://www.python.org/static/img/python-logo.png").content))
print(magic_detect(_read_head("/usr/lib/x86_64-linux-gnu/libelf.so.1", 129)))
# The .pyc signatures are only registered below, so this call runs against
# the table without them.
print(magic_detect(_read_head("__pycache__/get_magic.cpython-39.pyc", 100)))
# Extra signatures can be registered at runtime; later lookups see them.
magic_to_type[(0x61, 0x0d, 0x0d, 0x0a)] = "python 3.9 byte-compiled"
magic_to_type[(0x33, 0x0d, 0x0d, 0x0a)] = "python 3.6 byte-compiled"
print(magic_detect(_read_head("__pycache__/get_magic.cpython-39.pyc", 100)))
print(magic_detect(_read_head("__pycache__/get_magic.cpython-36.pyc", 100)))
# "#!" as a tuple of code points, i.e. (35, 33).
magic_to_type[tuple(map(ord, "#!"))] = "Shebang unix script"
print(magic_detect(_read_head("get_magic.py", 100)))
	#!/usr/bin/env python3
	from requests import get
	import re
	'''
	Sometimes you want minimalistic magic-number detection for simple file types.
	For instance, you scrape the web and you don't trust the MIME type returned by the web server
	(this actually happened to me with the Bibliothèque nationale de France more than once).
	So you want to make sure that, for instance, the returned content is actually a JPEG, and you may not be
	on a Unix OS with libmagic, and you may not want to add dependencies, and ...
	just comparing 3 bytes in a row in a raw byte stream is not THAT hard.
	'''

	""" How to build the dict (not for the faint of heart) by parsing a random web page.
	As with magic numbers, HTML should not be parsed with regular expressions, but sometimes it does
	the job without requiring the mental overhead of remembering how to use a specific parser (lxml).
	In production code where data should be guaranteed to be valid, use a parser, mmok?
	"""
	#content = get("https://gist.githubusercontent.com/leommoore/f9e57ba2aa4bf197ebc5/raw/e59c296951e0588509b1f777d1f98b2ce08272ad/file_magic_numbers.md").content.decode("utf8")

	#f = re.compile("""<tr>\s*<td>(?P<type>.+)</td>\s*<td>[^<]+</td>\s*<td>(?P<magic>[a-f0-9\ ]+)</td>""",re.MULTILINE|re.VERBOSE| re.I)
	#magic_to_type = dict(map(lambda t: (tuple(map(lambda st:int(st,16), t[1].split(" "))), t[0]), f.findall(content)))

	magic_to_type = {(66, 77): 'Bitmap format',
	(83, 73, 77, 80, 76, 69): 'FITS format',
	(71, 73, 70, 56): 'GIF format',
	(71, 75, 83, 77): 'Graphics Kernel System',
	(1, 218): 'IRIS rgb format',
	(241, 0, 64, 187): 'ITC (CMU WM) format',
	(255, 216, 255, 224): 'JPEG File Interchange Format',
	(73, 73, 78, 49): 'NIFF (Navy TIFF)',
	(86, 73, 69, 87): 'PM format',
	(137, 80, 78, 71): 'PNG format',
	(37, 33): 'Postscript format',
	(89, 166, 106, 149): 'Sun Rasterfile',
	(77, 77, 0, 42): 'TIFF format (Motorola - big endian) ',
	(73, 73, 42, 0): 'TIFF format (Intel - little endian) ',
	(103, 105, 109, 112, 32, 120, 99, 102, 32, 118): 'XCF Gimp file structure',
	(35, 70, 73, 71): 'Xfig format',
	(47, 42, 32, 88, 80, 77, 32, 42, 47): 'XPM format',
	(66, 90): 'Bzip',
	(31, 157): 'Compress',
	(31, 139): 'gzip format',
	(80, 75, 3, 4): 'pkzip format',
	(117, 115, 116, 97, 114): 'TAR (POSIX)',
	(77, 90): 'MS-DOS, OS/2 or MS Windows',
	(127, 69, 76, 70): 'Unix elf',
	(153, 0): 'pgp public ring',
	(149, 1): 'pgp security ring',
	(149, 0): 'pgp security ring',
	(166, 0): 'pgp encrypted data'}

	def magic_detect(a_byte_array):
	    """very unefficient very partial magic number detection, but sometimes
	    efficiency really does not matter"""
	    for pattern, name in magic_to_type.items():
	        to_read=len(pattern)
	        if len(a_byte_array) < to_read: continue
	        if tuple(a_byte_array[:to_read]) == pattern:
	            return name

	print(magic_detect(get("https://www.python.org/static/img/python-logo.png").content))
	print(magic_detect(open("/usr/lib/x86_64-linux-gnu/libelf.so.1","rb").read(129)))
	print(magic_detect(open("__pycache__/get_magic.cpython-39.pyc", "rb").read(100)))
	magic_to_type[( 0x61, 0x0d,0x0d, 0x0a)] = "python 3.9 byte-compiled"
	magic_to_type[( 0x33, 0x0d, 0x0d,0x0a)] = "python 3.6 byte-compiled"
	print(magic_detect(open("__pycache__/get_magic.cpython-39.pyc", "rb").read(100)))
	print(magic_detect(open("__pycache__/get_magic.cpython-36.pyc", "rb").read(100)))
	magic_to_type[tuple(map(ord,"#!"))] = "Shebang unix script"
	print(magic_detect(open("get_magic.py", "rb").read(100)))