Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Find Compressed Data without Compression Header
#!/usr/bin/env python3
#
# find-compressed-data.py
#
# A small script to bruteforce embedded compressed data that might not have a header
# Useful for raw binary firmware images that do not contain a standard
# binary header (ELF, PE, MACH-O).
#
# I included a minimum decompressed size of 16KB because brute-forcing has a
# tendency to create lots of small files, which are generally false positives.
#
# I usually run this over every firmware image I need to analyze.
#
# Usage: python find-compressed-data.py "filename.bin"
#
import zlib
import sys
import lzma
import bz2
import zipfile
import threading
import copy
# Size threshold in bytes (16 KiB): the do_* scanners only save a result when
# its decompressed length is greater than this value.
LIMIT = 1024 * 16
def do_bz2(compressed_data, limit=None):
    """Brute-force every offset of *compressed_data* for a bzip2 stream.

    For each offset the remainder of the buffer is handed to
    ``bz2.decompress``.  When that succeeds and yields more than *limit*
    bytes, the offset is printed and the decompressed bytes are written to
    ``./result-bz2-<offset>.bin.bz2``.  Despite the ``.bz2`` suffix, the
    file holds the *decompressed* payload.

    Args:
        compressed_data: Bytes-like object (e.g. a raw firmware image).
        limit: Decompressed size that a hit must exceed to be kept.
            Defaults to the module-level ``LIMIT``.

    Returns:
        List of offsets at which a large enough bzip2 stream was found.
    """
    if limit is None:
        limit = LIMIT
    # A memoryview makes every ``view[offset:]`` slice zero-copy; slicing the
    # bytes object itself would copy the whole tail once per offset.
    view = memoryview(compressed_data)
    found = []
    for offset in range(len(view)):
        try:
            unzipped = bz2.decompress(view[offset:])
        except (OSError, ValueError):
            # No valid bzip2 stream starts here -- by far the common case.
            continue
        if len(unzipped) <= limit:
            continue
        print('BZ2: Offset Found', offset)
        found.append(offset)
        try:
            with open('./result-bz2-' + str(offset) + '.bin.bz2', 'wb') as result:
                result.write(unzipped)
        except OSError as ex:
            # Keep scanning if one result cannot be saved, but report it.
            print('BZ2: could not write result for offset', offset, ex,
                  file=sys.stderr)
    return found
def do_lzma(compressed_data, limit=None):
    """Brute-force every offset of *compressed_data* for an LZMA/XZ stream.

    For each offset the remainder of the buffer is handed to
    ``lzma.decompress`` (default format auto-detection).  When that succeeds
    and yields more than *limit* bytes, the offset is printed and the
    decompressed bytes are written to ``./result-lzma-<offset>.bin.lzma``.
    Despite the ``.lzma`` suffix, the file holds the *decompressed* payload.

    Args:
        compressed_data: Bytes-like object (e.g. a raw firmware image).
        limit: Decompressed size that a hit must exceed to be kept.
            Defaults to the module-level ``LIMIT``.

    Returns:
        List of offsets at which a large enough LZMA/XZ stream was found.
    """
    if limit is None:
        limit = LIMIT
    # A memoryview makes every ``view[offset:]`` slice zero-copy; slicing the
    # bytes object itself would copy the whole tail once per offset.
    view = memoryview(compressed_data)
    found = []
    for offset in range(len(view)):
        try:
            unzipped = lzma.decompress(view[offset:])
        except (lzma.LZMAError, MemoryError):
            # No valid stream starts here.  MemoryError is included because
            # random bytes can look like a header asking for a huge dictionary.
            continue
        if len(unzipped) <= limit:
            continue
        print('LZMA: Offset Found', offset)
        found.append(offset)
        try:
            with open('./result-lzma-' + str(offset) + '.bin.lzma', 'wb') as result:
                result.write(unzipped)
        except OSError as ex:
            # Keep scanning if one result cannot be saved, but report it.
            print('LZMA: could not write result for offset', offset, ex,
                  file=sys.stderr)
    return found
def do_zlib(compressed_data, limit=None):
    """Brute-force every offset of *compressed_data* for a raw deflate stream.

    For each offset the remainder of the buffer is handed to
    ``zlib.decompress`` with ``wbits=-zlib.MAX_WBITS``, i.e. a headerless
    deflate stream with no zlib or gzip wrapper.  When that succeeds and
    yields more than *limit* bytes, the offset is printed and the
    decompressed bytes are written to ``./result-gz-<offset>.bin.gz``.
    Despite the ``.gz`` suffix, the file holds the *decompressed* payload and
    is not a gzip archive.

    Args:
        compressed_data: Bytes-like object (e.g. a raw firmware image).
        limit: Decompressed size that a hit must exceed to be kept.
            Defaults to the module-level ``LIMIT``.

    Returns:
        List of offsets at which a large enough deflate stream was found.
    """
    if limit is None:
        limit = LIMIT
    # A memoryview makes every ``view[offset:]`` slice zero-copy; slicing the
    # bytes object itself would copy the whole tail once per offset.
    view = memoryview(compressed_data)
    found = []
    for offset in range(len(view)):
        try:
            unzipped = zlib.decompress(view[offset:], -zlib.MAX_WBITS)
        except zlib.error:
            # No valid deflate stream starts here -- by far the common case.
            continue
        if len(unzipped) <= limit:
            continue
        print('GZIP: Offset found', offset)
        found.append(offset)
        try:
            with open('./result-gz-' + str(offset) + '.bin.gz', 'wb') as result:
                result.write(unzipped)
        except OSError as ex:
            # Keep scanning if one result cannot be saved, but report it.
            print('GZIP: could not write result for offset', offset, ex,
                  file=sys.stderr)
    return found
def main(argv=None):
    """Scan the file named on the command line with all three scanners.

    Reads the whole file into memory, then runs ``do_zlib``, ``do_lzma`` and
    ``do_bz2`` over it, each in its own thread, and waits for them to finish.

    Args:
        argv: Argument vector; defaults to ``sys.argv``.  ``argv[1]`` is the
            path of the binary image to scan.

    Returns:
        Process exit status: 0 after a completed scan, 2 on a usage error.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        # Without this check a missing argument surfaces as a bare IndexError.
        print('Usage: python find-compressed-data.py "filename.bin"',
              file=sys.stderr)
        return 2

    with open(argv[1], 'rb') as image:
        compressed_data = image.read()

    # bytes objects are immutable, so all threads can share the one buffer;
    # copy.copy() on bytes does not produce a separate buffer anyway.
    threads = [
        threading.Thread(target=scanner, args=(compressed_data,))
        for scanner in (do_zlib, do_lzma, do_bz2)
    ]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    return 0


if __name__ == '__main__':
    sys.exit(main())
@bphd
Copy link

bphd commented Jul 5, 2022

Sure, it produces files of several KB, but although those files are named .gz, the header is not rebuilt, so they are identified as many different things and are not supported by gzip; often they are not even compressed (for example, it produced an HTML file that I was able to read directly in a browser). So I guess its method of finding compressed bits actually finds far more than compressed data. Or I am mistaken, and it decompresses the data and writes out each piece it finds separately. But in that case I don't understand the extension.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment