Skip to content

Instantly share code, notes, and snippets.

@mvrozanti
Created January 5, 2019 04:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mvrozanti/0f0589b9006a37e1fe6f0be914d64426 to your computer and use it in GitHub Desktop.
Save mvrozanti/0f0589b9006a37e1fe6f0be914d64426 to your computer and use it in GitHub Desktop.
Finding images in a binary file w/ python
#!/usr/bin/env python2
# original author: jason moiron
# http://jmoiron.net/blog/finding-images-binary-file-w-python/
# Number of bytes requested from the input file per read (4 MiB).
chunk = 1048576 * 4

# JPEG/JFIF marker byte sequences.  They are byte literals so they can be
# matched against data read from a file opened in binary mode on both
# Python 2 (where b'...' is plain str) and Python 3.
# http://www.obrador.com/essentialjpeg/headerinfo.htm
jfif_id = b'JFIF\x00'                    # identifier inside the APP0 segment
diffie_quant_marker = b'\xff\xdb'        # DQT: define quantization table
diffie_huffman_marker = b'\xff\xc4'      # DHT: define Huffman table
frame_marker = b'\xff\xc0'               # SOF0: start of frame (baseline)
scan_marker = b'\xff\xda'                # SOS: start of scan
# COM (comment) is 0xFFFE; the previous value 0xFFEE is the APP14 marker.
comment_marker = b'\xff\xfe'
end_of_image = eoi = b'\xff\xd9'         # EOI: end of image
# SOI (0xFFD8) immediately followed by the APP0 marker (0xFFE0).
soi = b'\xff\xd8\xff\xe0'
def extra_check(string):
    """Second-stage sanity test for a JPEG candidate.

    Returns True when the JFIF identifier (`jfif_id`) occurs somewhere in
    the first 11 bytes of `string`, False otherwise.
    """
    # Only the leading 11 bytes are inspected -- presumably the 4-byte
    # SOI/APP0 prefix, a 2-byte segment length and the 5-byte identifier.
    header = string[:11]
    return header.find(jfif_id) != -1
def slice_image(img):
    """Cut `img` off right after its end-of-image (EOI) marker.

    `img` is a byte string that is assumed to start at the beginning of a
    JPEG stream.  The quantization, Huffman, frame, scan and comment
    markers are looked up one after another, each search starting where
    the previous marker was found, and the EOI marker is then searched for
    from the last position reached.

    Any of the intermediate markers may be absent; a missing marker is
    skipped and the search continues from the last marker that was found.

    Returns `img` up to and including the EOI marker.  If no EOI marker is
    present the whole of `img` is returned unchanged, since the data is
    either truncated or continues beyond the buffer.
    """
    pos = 0
    ordered_markers = (
        diffie_quant_marker,
        diffie_huffman_marker,
        frame_marker,
        scan_marker,
        comment_marker,
    )
    for marker in ordered_markers:
        found = img.find(marker, pos)
        # find() signals a miss with -1, and -1 used as a start offset means
        # "begin at the last byte".  Feeding it back in would make every
        # following search fail, so keep the previous position instead.
        if found != -1:
            pos = found
    eoi_loc = img.find(end_of_image, pos)
    if eoi_loc == -1:
        return img
    return img[:eoi_loc + len(end_of_image)]
def generate_jpeg_files(f, max_extra_reads=16):
    """Yield byte strings from `f` that look like JFIF/JPEG images.

    `f` is a file object opened in binary mode.  The file is consumed in
    reads of `chunk` bytes.  Every occurrence of the `soi` marker that
    also passes `extra_check` is handed to `slice_image`, and the result
    is yielded.

    `max_extra_reads` bounds how many additional reads are made while
    waiting for an end-of-image marker to appear after a candidate start,
    so that a false positive cannot pull an arbitrarily large part of the
    file into memory.

    The generator finishes when the file is exhausted and no further
    candidate start marker remains in the buffered data.
    """
    keep = len(soi) - 1
    buf = b''
    while True:
        # Refill until the buffer contains a start-of-image marker.
        start = buf.find(soi)
        while start == -1:
            data = f.read(chunk)
            if not data:
                # End of file and nothing left to examine.
                return
            # Carry the last few bytes over so a marker that is split
            # across two reads is still detected.
            tail = buf[-keep:] if keep else buf[:0]
            buf = tail + data
            start = buf.find(soi)
        buf = buf[start:]

        # extra_check looks at the first 11 bytes; top up if we are short.
        if len(buf) < 11:
            buf += f.read(chunk)
        if not extra_check(buf):
            # hmm.. it wasn't a jpeg after all; step past this marker.
            buf = buf[1:]
            continue

        # The image may extend past what has been read so far.  Keep
        # reading until an EOI marker shows up behind the header, the file
        # ends, or the read budget is used up.
        scan_from = len(soi)
        reads_left = max_extra_reads
        while reads_left > 0 and buf.find(end_of_image, scan_from) == -1:
            data = f.read(chunk)
            if not data:
                break
            # Resume just before the old end so an EOI marker split across
            # the boundary is found without rescanning the whole buffer.
            scan_from = max(len(buf) - (len(end_of_image) - 1), len(soi))
            buf += data
            reads_left -= 1

        image = slice_image(buf)
        # Always advance by at least one byte so the loop makes progress.
        buf = buf[max(len(image), 1):]
        yield image
def find_all_images(filename, threshold=None):
    """Extract JPEG candidates from `filename` into the current directory.

    Each image produced by `generate_jpeg_files` is written to
    `potential_image_NNNN.jpg`, numbered from 0000 in order of discovery.

    `threshold` is the maximum number of files to write.  None (the
    default) means no limit; 0 or a negative value writes nothing.
    """
    if threshold is not None and threshold <= 0:
        return
    # Context managers close both files even if a read or write fails.
    with open(filename, 'rb') as f:
        for num, img in enumerate(generate_jpeg_files(f)):
            with open('potential_image_%04d.jpg' % num, 'wb') as ifile:
                ifile.write(img)
            # num is zero-based, so num + 1 files exist at this point.
            # Stopping here also avoids scanning for an image that would
            # only be thrown away.
            if threshold is not None and num + 1 >= threshold:
                break
if __name__ == '__main__':
    # Command-line entry point: `script [-t N] filename`.
    import optparse
    parser = optparse.OptionParser(usage='%prog [opts] filename', version='1.0')
    # type='int' lets optparse reject non-numeric values with a usage
    # message instead of an uncaught ValueError.
    parser.add_option('-t', '--threshold', type='int', default=None,
                      help='maximum number of image files to extract')
    opts, args = parser.parse_args()
    if not args:
        # Prints the usage text and exits with status 2 instead of raising
        # IndexError on args[0].
        parser.error('missing filename argument')
    threshold = opts.threshold
    find_all_images(args[0], threshold)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment