Skip to content

Instantly share code, notes, and snippets.

@noahlias
Created June 5, 2023 16:34
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save noahlias/9c90aab66e322977b4a02fa17cb6a809 to your computer and use it in GitHub Desktop.
Save noahlias/9c90aab66e322977b4a02fa17cb6a809 to your computer and use it in GitHub Desktop.
OLE file research.
import click
import olefile
import os
import imghdr
import struct
import filetype
@click.command()
@click.argument("file_name", type=click.Path(exists=True))
@click.argument("output_directory", type=click.Path())
def extract_images(file_name: str, output_directory: str) -> None:
    """Extract embedded images from one file, or from every file under a directory.

    Args:
        file_name: Path to a single file or to a directory that is walked
            recursively. click is asked to verify that the path exists.
        output_directory: Directory that receives the extracted images; it is
            created if it does not exist yet.

    Processing is best-effort: a failure on one file is reported and the
    remaining files are still processed.
    """

    def _process_safely(path: str) -> None:
        # Batch boundary: report the failing file and keep going so a single
        # corrupt document does not abort a whole directory run.
        try:
            process_file(path, output_directory)
        except Exception as exc:
            click.echo(f"Failed to process {path}: {exc}", err=True)

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_directory, exist_ok=True)

    if os.path.isfile(file_name):
        _process_safely(file_name)
    elif os.path.isdir(file_name):
        for root, _dirs, names in os.walk(file_name):
            for name in names:
                _process_safely(os.path.join(root, name))
    else:
        click.echo("Invalid file or directory.")
def _slice_bmp(data: bytes, start: int) -> bytes:
    """Return the BMP file starting at *start*, or b"" if its header is implausible."""
    header_size = 14  # magic(2) + file size(4) + two reserved words(2+2) + pixel offset(4)
    if len(data) - start < header_size:
        return b""
    declared_size, reserved1, reserved2 = struct.unpack_from("<IHH", data, start + 2)
    # The two-byte "BM" magic shows up by chance in binary data, so also
    # require zeroed reserved words and a size that can at least hold the header.
    if reserved1 or reserved2 or declared_size < header_size:
        return b""
    # Slicing clips at the end of the data if the declared size overshoots.
    return data[start:start + declared_size]


def process_file(file_path: str, output_directory: str) -> None:
    """Carve embedded images out of one OLE2 document and write them to disk.

    The stream to scan is chosen from the file extension (matched
    case-insensitively). Its bytes are searched for image signatures; every
    candidate whose header is recognised by ``imghdr`` is written as
    ``<basename>_<n>.<ext>`` inside *output_directory*. Existing output files
    are never overwritten.

    Args:
        file_path: Path of the document to inspect.
        output_directory: Existing directory that receives the images.

    Files that are not OLE2 containers, have an unmapped extension, or lack
    the expected stream are reported with ``click.echo`` and skipped.
    """
    if not olefile.isOleFile(file_path):
        file_info = filetype.guess(file_path)
        file_format = "Unknown" if file_info is None else file_info.mime
        click.echo(
            f"Invalid file format: {file_path}. Expected OLE2 structured storage file, found: {file_format}."
        )
        return

    # ext -> (start signature, end signature, name imghdr reports for it).
    # One entry per distinct signature: the former "jpg"/"dib" aliases shared
    # the "jpeg"/"bmp" signatures and would write every image twice.
    # PNG is terminated by its IEND chunk so several PNGs can be separated.
    # BMP has no terminator; its length comes from the file header instead.
    image_formats = {
        "tif": (b"\x49\x49\x2A\x00", b"\x00\x00\x00\x00", "tiff"),
        "jpeg": (b"\xff\xd8", b"\xff\xd9", "jpeg"),
        "bmp": (b"\x42\x4D", None, "bmp"),
        "png": (b"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A", b"IEND\xae\x42\x60\x82", "png"),
    }
    file_stream_dict = {
        "doc": "Data",
        "dot": "Data",
        "dotm": "Data",
        "xlt": "Workbook",
        "xlsm": "Workbook",
        "pps": "Pictures",
        "pptm": "Pictures",
        "wps": "Data",
        "wpt": "Data",
        "dps": "Pictures",
        "dpt": "Pictures",
        "et": "Workbook",
        "ett": "Workbook",
    }

    file_ext = os.path.splitext(file_path)[1][1:].lower()
    stream_name = file_stream_dict.get(file_ext)
    if stream_name is None:
        click.echo(f"Unsupported file extension for image extraction: {file_path}")
        return

    ole = olefile.OleFileIO(file_path)
    try:
        if not ole.exists(stream_name):
            click.echo(f"No '{stream_name}' stream found in {file_path}.")
            return
        req_data = ole.openstream(stream_name).read()
    finally:
        # Everything below works on the in-memory copy, so release the
        # container immediately, including on errors.
        ole.close()

    base_name = os.path.basename(file_path)
    image_count = 0
    for ext, (start_marker, end_marker, imghdr_kind) in image_formats.items():
        start = 0
        while True:
            start = req_data.find(start_marker, start)
            if start == -1:
                break

            if end_marker is None:
                image_data = _slice_bmp(req_data, start)
            else:
                # The first terminator after the start is used; an image that
                # contains the terminator bytes internally will be cut short.
                marker_pos = req_data.find(end_marker, start)
                if marker_pos == -1:
                    # No terminator after this point, so no later candidate
                    # of this format can be complete either.
                    break
                image_data = req_data[start:marker_pos + len(end_marker)]

            # imghdr only inspects the leading bytes, so validate in memory
            # instead of writing a file and deleting it again.
            if not image_data or imghdr.what(None, image_data[:32]) != imghdr_kind:
                # False signature hit: step past it and keep scanning rather
                # than giving up on the whole format.
                start += 1
                continue

            end = start + len(image_data)
            image_count += 1
            output_file = os.path.join(
                output_directory, f"{base_name}_{image_count}.{ext}"
            )
            if not os.path.exists(output_file):
                with open(output_file, "wb") as f:
                    f.write(image_data)
                click.echo(
                    f"Extracted {ext.upper()} image {image_count} from {file_path} to {output_file}"
                )
            start = end
def process_bmp_image(image_data: bytes) -> bytes:
    """Trim *image_data* to the file size declared in its BMP header.

    Args:
        image_data: Bytes that are expected to start with a BMP file header.

    Returns:
        The first ``size`` bytes, where ``size`` is the little-endian 32-bit
        value stored at offset 2. The input is returned unchanged when it does
        not start with ``BM`` or is too short to contain the size field.
    """
    # The size field occupies bytes 2..5; anything shorter cannot be unpacked.
    if len(image_data) < 6 or image_data[:2] != b"\x42\x4D":
        return image_data
    (length,) = struct.unpack_from("<I", image_data, 2)
    # Slicing clips silently if the declared size exceeds the available data.
    return image_data[:length]
def is_valid_bmp(file_path: str) -> bool:
    """Check that the two reserved words of a BMP file header are zero.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        True when the four bytes at offset 6 (two little-endian 16-bit
        fields) are all zero; False otherwise, including when the file is too
        short to contain them. The ``BM`` signature itself is not checked here.
    """
    with open(file_path, "rb") as f:
        f.seek(6)
        reserved = f.read(4)
    if len(reserved) < 4:
        return False
    # Unpack as two uint16 values and compare against the integer 0; comparing
    # an unpacked int with a bytes object can never be true.
    return all(field == 0 for field in struct.unpack("<HH", reserved))
# Run the click command only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    extract_images()
@noahlias
Copy link
Author

noahlias commented Jun 5, 2023

简单研究了一下这个OLE

image

Output

image

大部分的代码AI辅助,参考了很多OLE文件相关资料等
具体实现原理就是读取stream找到image独特的signature,重写图片

@albertofwb
Copy link

不错,套上 ole 再搜索,比我的纯re打法更稳定一些。但是我的样本里有的不止一张图片。要完美work 少不了再掉几根头发。

ps: click 不错,学到了

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment