mvrozanti/gist:0f0589b9006a37e1fe6f0be914d64426

## gistfile1.txt
#!/usr/bin/env python2
# original author: jason moiron
# http://jmoiron.net/blog/finding-images-binary-file-w-python/

# Number of bytes pulled from the input file per read (4 MiB).
chunk = 1048576 * 4
# The values below are raw byte sequences that get matched against data read
# from a file opened in binary mode, so they are written as bytes literals:
# on Python 2.6+ these are the very same objects as plain str literals, and
# they are the only form that compares against binary file data on Python 3.
# http://www.obrador.com/essentialjpeg/headerinfo.htm
jfif_id = b'JFIF\x00'                 # identifier carried by the APP0 segment
diffie_quant_marker = b'\xff\xdb'     # DQT: define quantization table
diffie_huffman_marker = b'\xff\xc4'   # DHT: define Huffman table
frame_marker = b'\xff\xc0'            # SOF0: start of frame (baseline DCT)
scan_marker = b'\xff\xda'             # SOS: start of scan
# NOTE(review): 0xFFEE is the APP14 segment marker; the JPEG comment (COM)
# marker is 0xFFFE.  The value is kept as-is because slice_image() searches
# for it -- confirm which one is intended before changing it.
comment_marker = b'\xff\xee'
end_of_image = eoi = b'\xff\xd9'      # EOI: end of image
soi = b'\xff\xd8\xff\xe0'             # SOI directly followed by the APP0 marker

def extra_check(string):
    """Tell whether `string` carries the JFIF identifier near its start.

    Returns True when `jfif_id` occurs entirely within the first 11 bytes of
    `string`, False otherwise.
    """
    # Only the leading 11 bytes are inspected.
    header_len = 11
    return string.find(jfif_id, 0, header_len) != -1

def slice_image(img):
    """Return the leading part of `img` that ends at its EOI marker.

    `img` is expected to start at the beginning of a jpeg stream.  The
    markers are looked up in the order quantization table, Huffman table,
    frame, scan, comment, each search starting where the previous marker was
    found; the first EOI marker at or after the last marker located closes
    the image.

    A marker that does not occur is skipped.  find() reports a miss as -1,
    and using -1 as the next start offset would confine every later search to
    the final byte of `img`, collapsing the result to a single byte.

    Known limitation: a marker that only occurs further along the buffer
    (for instance inside a following image) still moves the search forward.

    Returns `img` up to and including the EOI marker, or all of `img` when
    no EOI marker follows (the data is truncated).
    """
    ordered_markers = (
        diffie_quant_marker,
        diffie_huffman_marker,
        frame_marker,
        scan_marker,
        comment_marker,
    )
    pos = 0
    for marker in ordered_markers:
        found = img.find(marker, pos)
        if found != -1:
            pos = found
    eoi_loc = img.find(end_of_image, pos)
    if eoi_loc == -1:
        return img
    return img[:eoi_loc + len(end_of_image)]

def generate_jpeg_files(f):
    """Yield every byte string found in `f` that looks like a JFIF jpeg.

    `f` is a file object opened in binary mode; it is consumed sequentially
    in reads of `chunk` bytes.  A candidate begins at each occurrence of
    `soi` that passes extra_check(), and its extent is decided by
    slice_image().

    Data is carried over between reads so that neither a start marker split
    across two reads nor an image running past the current read is lost.
    """
    header_len = 11          # number of leading bytes extra_check() inspects
    keep = len(soi) - 1      # longest proper prefix of soi that can end a read
    buf = b''
    while True:
        start = buf.find(soi)
        if start == -1:
            data = f.read(chunk)
            if not data:
                return
            # Hold on to the tail of the old data: it may contain the first
            # bytes of a start marker that the new data completes.
            tail = buf[-keep:] if keep else buf[:0]
            buf = tail + data
            continue
        buf = buf[start:]
        if len(buf) < header_len:
            # The header is cut off by the end of the read; fetch more.
            buf += f.read(chunk)
        if not extra_check(buf):
            # hmm.. it wasn't a jpeg after all, step past it and rescan
            buf = buf[1:]
            continue
        # Keep reading until an end-of-image marker is in the buffer (or the
        # file runs out) so an image spanning several reads is not cut short.
        scanned = len(soi)
        while buf.find(end_of_image, scanned) == -1:
            data = f.read(chunk)
            if not data:
                break
            # Resume just before the old end: the marker may straddle it.
            scanned = max(len(buf) - (len(end_of_image) - 1), len(soi))
            buf += data
        image = slice_image(buf)
        # Always consume at least one byte so the scan keeps moving forward.
        buf = buf[max(len(image), 1):]
        yield image

def find_all_images(filename, threshold=None):
    """Write every jpeg found inside `filename` to the current directory.

    The images are saved as potential_image_0000.jpg,
    potential_image_0001.jpg, ... in the order in which they are found.

    `threshold` is the maximum number of files to write; None or 0 means
    no limit.
    """
    # The with-blocks close both files even when reading or writing fails.
    with open(filename, 'rb') as f:
        for num, img in enumerate(generate_jpeg_files(f)):
            with open('potential_image_%04d.jpg' % num, 'wb') as ifile:
                ifile.write(img)
            # num is zero-based, so num + 1 files have been written by now.
            if threshold and num + 1 >= threshold:
                break

if __name__ == '__main__':
    # Command line entry point: extract the jpegs found in the given file.
    import optparse
    parser = optparse.OptionParser(usage='%prog [opts] filename', version='1.0')
    # type='int' makes optparse convert the value itself and report a
    # non-numeric one as a usage error instead of an uncaught ValueError.
    parser.add_option('-t', '--threshold', type='int',
                      help='maximum number of image files to extract')
    opts, args = parser.parse_args()
    if not args:
        # Exit with the usage message rather than an IndexError on args[0].
        parser.error('no filename given')
    threshold = opts.threshold  # None when the option is not supplied
    find_all_images(args[0], threshold)
	#!/usr/bin/env python2
	# original author: jason moiron
	# http://jmoiron.net/blog/finding-images-binary-file-w-python/

	# Number of bytes pulled from the input file per read (4 MiB).
	chunk = 1048576 * 4
	# The values below are raw byte sequences that get matched against data
	# read from a file opened in binary mode, so they are written as bytes
	# literals: on Python 2.6+ these are the very same objects as plain str
	# literals, and they are the only form that compares against binary file
	# data on Python 3.
	# http://www.obrador.com/essentialjpeg/headerinfo.htm
	jfif_id = b'JFIF\x00'                 # identifier carried by the APP0 segment
	diffie_quant_marker = b'\xff\xdb'     # DQT: define quantization table
	diffie_huffman_marker = b'\xff\xc4'   # DHT: define Huffman table
	frame_marker = b'\xff\xc0'            # SOF0: start of frame (baseline DCT)
	scan_marker = b'\xff\xda'             # SOS: start of scan
	# NOTE(review): 0xFFEE is the APP14 segment marker; the JPEG comment (COM)
	# marker is 0xFFFE.  The value is kept as-is because slice_image() searches
	# for it -- confirm which one is intended before changing it.
	comment_marker = b'\xff\xee'
	end_of_image = eoi = b'\xff\xd9'      # EOI: end of image
	soi = b'\xff\xd8\xff\xe0'             # SOI directly followed by the APP0 marker

	def extra_check(string):
		"""An extra check to make sure we're looking at a jpeg file...

		Returns True when `jfif_id` occurs entirely within the first 11 bytes
		of `string`, False otherwise.
		"""
		return jfif_id in string[:11]

	def slice_image(img):
		"""Return the leading part of `img` that ends at its EOI marker.

		`img` is expected to start at the beginning of a jpeg stream.  The
		markers are looked up in the order quantization table, Huffman table,
		frame, scan, comment, each search starting where the previous marker
		was found; the first EOI marker at or after the last marker located
		closes the image.

		A marker that does not occur is skipped.  find() reports a miss as -1,
		and using -1 as the next start offset would confine every later search
		to the final byte of `img`, collapsing the result to a single byte.

		Known limitation: a marker that only occurs further along the buffer
		(for instance inside a following image) still moves the search forward.

		Returns `img` up to and including the EOI marker, or all of `img` when
		no EOI marker follows (the data is truncated).
		"""
		ordered_markers = (
			diffie_quant_marker,
			diffie_huffman_marker,
			frame_marker,
			scan_marker,
			comment_marker,
		)
		pos = 0
		for marker in ordered_markers:
			found = img.find(marker, pos)
			if found != -1:
				pos = found
		eoi_loc = img.find(end_of_image, pos)
		if eoi_loc == -1:
			return img
		return img[:eoi_loc + len(end_of_image)]

	def generate_jpeg_files(f):
		"""Yield every byte string found in `f` that looks like a JFIF jpeg.

		`f` is a file object opened in binary mode; it is consumed
		sequentially in reads of `chunk` bytes.  A candidate begins at each
		occurrence of `soi` that passes extra_check(), and its extent is
		decided by slice_image().

		Data is carried over between reads so that neither a start marker
		split across two reads nor an image running past the current read is
		lost.
		"""
		header_len = 11          # number of leading bytes extra_check() inspects
		keep = len(soi) - 1      # longest proper prefix of soi that can end a read
		buf = b''
		while True:
			start = buf.find(soi)
			if start == -1:
				data = f.read(chunk)
				if not data:
					return
				# Hold on to the tail of the old data: it may contain the
				# first bytes of a start marker that the new data completes.
				tail = buf[-keep:] if keep else buf[:0]
				buf = tail + data
				continue
			buf = buf[start:]
			if len(buf) < header_len:
				# The header is cut off by the end of the read; fetch more.
				buf += f.read(chunk)
			if not extra_check(buf):
				# hmm.. it wasn't a jpeg after all, step past it and rescan
				buf = buf[1:]
				continue
			# Keep reading until an end-of-image marker is in the buffer (or
			# the file runs out) so an image spanning several reads is not
			# cut short.
			scanned = len(soi)
			while buf.find(end_of_image, scanned) == -1:
				data = f.read(chunk)
				if not data:
					break
				# Resume just before the old end: the marker may straddle it.
				scanned = max(len(buf) - (len(end_of_image) - 1), len(soi))
				buf += data
			image = slice_image(buf)
			# Always consume at least one byte so the scan keeps moving forward.
			buf = buf[max(len(image), 1):]
			yield image

	def find_all_images(filename, threshold=None):
		"""Write every jpeg found inside `filename` to the current directory.

		The images are saved as potential_image_0000.jpg,
		potential_image_0001.jpg, ... in the order in which they are found.

		`threshold` is the maximum number of files to write; None or 0 means
		no limit.
		"""
		# The with-blocks close both files even when reading or writing fails.
		with open(filename, 'rb') as f:
			for num, img in enumerate(generate_jpeg_files(f)):
				with open('potential_image_%04d.jpg' % num, 'wb') as ifile:
					ifile.write(img)
				# num is zero-based, so num + 1 files have been written by now.
				if threshold and num + 1 >= threshold:
					break

	if __name__ == '__main__':
		# Command line entry point: extract the jpegs found in the given file.
		import optparse
		parser = optparse.OptionParser(usage='%prog [opts] filename', version='1.0')
		# type='int' makes optparse convert the value itself and report a
		# non-numeric one as a usage error instead of an uncaught ValueError.
		parser.add_option('-t', '--threshold', type='int',
			help='maximum number of image files to extract')
		opts, args = parser.parse_args()
		if not args:
			# Exit with the usage message rather than an IndexError on args[0].
			parser.error('no filename given')
		threshold = opts.threshold  # None when the option is not supplied
		find_all_images(args[0], threshold)