Created
June 5, 2023 16:34
-
-
Save noahlias/9c90aab66e322977b4a02fa17cb6a809 to your computer and use it in GitHub Desktop.
OLE file research.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import imghdr
import os
import struct

import click
import filetype
import olefile
@click.command()
@click.argument("file_name", type=click.Path(exists=True))
@click.argument("output_directory", type=click.Path())
def extract_images(file_name: str, output_directory: str) -> None:
    """Extract embedded images from an OLE2 file or a directory of them.

    FILE_NAME is either a single file or a directory that is walked
    recursively. Every file found is handed to ``process_file``; extracted
    images are written into OUTPUT_DIRECTORY, which is created if missing.
    A failure on one file is reported and does not stop the remaining files.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_directory, exist_ok=True)

    if os.path.isfile(file_name):
        candidates = [file_name]
    elif os.path.isdir(file_name):
        candidates = (
            os.path.join(root, name)
            for root, _dirs, names in os.walk(file_name)
            for name in names
        )
    else:
        click.echo("Invalid file or directory.")
        return

    for path in candidates:
        try:
            process_file(path, output_directory)
        except Exception as exc:
            # Top-level boundary: keep going with the other files, but say
            # which file failed and send the report to stderr.
            click.echo(f"Failed to process {path}: {exc}", err=True)
def process_file(file_path: str, output_directory: str) -> None:
    """Carve embedded images out of one OLE2 file and write them to disk.

    The stream that holds picture data is chosen from the file extension,
    read fully into memory, and scanned for image signatures. Each candidate
    is validated before it is kept. Output files are named
    ``<basename>_<n>.<ext>`` where ``n`` counts the images extracted from
    this file; a name that already exists is skipped, not overwritten.

    Args:
        file_path: Path of the document to scan.
        output_directory: Existing directory that receives the images.

    Non-OLE files, unsupported extensions and missing streams are reported
    with ``click.echo`` and skipped. I/O errors propagate to the caller.
    """
    if not olefile.isOleFile(file_path):
        file_info = filetype.guess(file_path)
        file_format = "Unknown" if file_info is None else file_info.mime
        click.echo(
            f"Invalid file format: {file_path}. Expected OLE2 structured storage file, found: {file_format}."
        )
        return

    # extension written -> (start signature, end marker or None).
    # Keys must be the names imghdr reports, because they are compared with
    # imghdr.what() below. The former "jpg", "tif" and "dib" entries could
    # never pass that comparison (imghdr says "jpeg", "tiff", "bmp") and only
    # produced files that were deleted again, so they are not listed.
    image_formats = {
        # NOTE: the first EOI marker is used; a JPEG with an embedded
        # thumbnail may therefore be cut short.
        "jpeg": (b"\xff\xd8", b"\xff\xd9"),
        # BMP has no trailer; its length comes from the file header instead.
        "bmp": (b"\x42\x4D", None),
        # A PNG always ends with the IEND chunk type followed by its fixed CRC.
        "png": (b"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A", b"IEND\xae\x42\x60\x82"),
    }
    file_stream_dict = {
        "doc": "Data",
        "dot": "Data",
        "dotm": "Data",
        "xls": "Workbook",
        "xlt": "Workbook",
        "xlsm": "Workbook",
        "ppt": "Pictures",
        "pps": "Pictures",
        "pptm": "Pictures",
        "wps": "Data",
        "wpt": "Data",
        "dps": "Pictures",
        "dpt": "Pictures",
        "et": "Workbook",
        "ett": "Workbook",
    }

    # Lower-case so "REPORT.DOC" is treated like "report.doc".
    file_ext = os.path.splitext(file_path)[1][1:].lower()
    stream_name = file_stream_dict.get(file_ext)
    if stream_name is None:
        click.echo(f"Unsupported file extension: {file_path}.")
        return

    ole = olefile.OleFileIO(file_path)
    try:
        if not ole.exists(stream_name):
            click.echo(f"Stream {stream_name!r} not found in {file_path}.")
            return
        req_data = ole.openstream(stream_name).read()
    finally:
        # Always release the file handle, even when reading fails.
        ole.close()

    bmp_header_size = 14  # size of the BITMAPFILEHEADER
    base_name = os.path.basename(file_path)
    image_count = 0
    for ext, (start_marker, end_marker) in image_formats.items():
        start = 0
        while True:
            start = req_data.find(start_marker, start)
            if start == -1:
                break

            if end_marker is None:
                end = len(req_data)
            else:
                marker_pos = req_data.find(end_marker, start + len(start_marker))
                if marker_pos == -1:
                    # No terminator left in the stream, so no complete image
                    # of this format can follow either.
                    break
                end = marker_pos + len(end_marker)

            if ext == "bmp":
                available = end - start
                if available < bmp_header_size:
                    # Too close to the end of the stream for any BMP header.
                    break
                declared_size = struct.unpack_from("<I", req_data, start + 2)[0]
                if not bmp_header_size <= declared_size <= available:
                    # Random "BM" bytes or a truncated bitmap: skip this hit
                    # without copying or writing anything.
                    start += 1
                    continue
                image_data = process_bmp_image(req_data[start:end])
                # Continue after this bitmap so later ones are found too.
                end = start + len(image_data)
            else:
                image_data = req_data[start:end]

            # Validate in memory (imghdr only inspects the first 32 bytes) so
            # false-positive hits never touch the disk. A rejected hit moves
            # the search forward by one byte instead of abandoning the format.
            # NOTE: imghdr is deprecated and removed in Python 3.13.
            if imghdr.what(None, h=image_data[:32]) != ext:
                start += 1
                continue

            candidate_number = image_count + 1
            output_file = os.path.join(
                output_directory, f"{base_name}_{candidate_number}.{ext}"
            )
            if os.path.exists(output_file):
                # Slot already taken (e.g. by an earlier run): keep it and
                # move on to the next number.
                image_count = candidate_number
                start = end
                continue

            with open(output_file, "wb") as f:
                f.write(image_data)

            if ext == "bmp" and not is_valid_bmp(output_file):
                os.remove(output_file)
                start += 1
                continue

            # Count only images that were actually kept, so numbering has no gaps.
            image_count = candidate_number
            click.echo(
                f"Extracted {ext.upper()} image {image_count} from {file_path} to {output_file}"
            )
            start = end
def process_bmp_image(image_data: bytes) -> bytes:
    """Trim BMP data to the file size declared in its header.

    Args:
        image_data: Bytes that start at a suspected BMP signature and may
            run past the end of the bitmap.

    Returns:
        ``image_data`` cut to the little-endian 32-bit size stored at
        offset 2. The input is returned unchanged when it does not start
        with ``BM`` or is too short to contain the size field.
    """
    # The size field occupies bytes 2..5; without it there is nothing to trim
    # (and unpacking would raise struct.error).
    if image_data[:2] != b"\x42\x4D" or len(image_data) < 6:
        return image_data

    (length,) = struct.unpack_from("<I", image_data, 2)
    # Slicing past the end is harmless: a too-large length keeps everything.
    return image_data[:length]
def is_valid_bmp(file_path: str) -> bool:
    """Check that the reserved fields of a BMP file header are zero.

    Bytes 6..9 of the file are read and compared against four zero bytes.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        True when all four bytes are present and zero; False otherwise,
        including when the file is too short to contain them.
    """
    with open(file_path, "rb") as f:
        f.seek(6)
        reserved = f.read(4)
    # Compare raw bytes: a short read yields fewer than four bytes and is
    # therefore rejected without raising.
    return reserved == b"\x00" * 4
# Run the click command only when this file is executed as a script.
if __name__ == "__main__":
    extract_images()
不错,套上 ole 再搜索,比我的纯re打法更稳定一些。但是我的样本里有的不止一张图片。要完美work 少不了再掉几根头发。
ps: click 不错,学到了
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
简单研究了一下这个
OLE
Output
大部分代码由 AI 辅助编写,并参考了许多 OLE 文件相关的资料。
具体实现原理:读取 OLE 文件中的 stream,在其中查找各图片格式特有的 signature(文件头标记),再把对应的字节重新写出为图片文件。