# noahlias/olefile_test.py

## olefile_test.py
import click
import olefile
import os
import imghdr
import struct
import filetype


@click.command()
@click.argument("file_name", type=click.Path(exists=True))
@click.argument("output_directory", type=click.Path())
def extract_images(file_name: str, output_directory: str) -> None:
    """Extract embedded images from an OLE2 file or from every file in a directory.

    FILE_NAME is either a single file or a directory that is walked
    recursively. OUTPUT_DIRECTORY receives the extracted images and is
    created when it does not exist yet.

    A failure while processing one file is reported on stderr together with
    the file's path and does not stop the remaining files.
    """
    # exist_ok avoids the race of a separate exists() check before creating.
    os.makedirs(output_directory, exist_ok=True)

    if os.path.isfile(file_name):
        file_paths = iter([file_name])
    elif os.path.isdir(file_name):
        file_paths = (
            os.path.join(root, name)
            for root, _dirs, names in os.walk(file_name)
            for name in names
        )
    else:
        click.echo("Invalid file or directory.")
        return

    for file_path in file_paths:
        try:
            process_file(file_path, output_directory)
        except Exception as e:
            # Best-effort batch run: report which file failed and keep going.
            click.echo(f"Failed to process {file_path}: {e}", err=True)


def process_file(file_path: str, output_directory: str) -> None:
    """Extract embedded images from one OLE2 document into ``output_directory``.

    The OLE stream to scan is chosen from the (case-insensitive) file
    extension. The stream is read into memory and searched for the start
    marker of each supported image format. Every candidate is written to
    ``<basename>_<n>.<ext>`` and then validated with ``imghdr`` (plus
    ``is_valid_bmp`` for BMP); a candidate that fails validation is deleted
    and scanning for that format stops.

    Files that are not OLE2 containers, or whose extension has no known
    stream, are reported with ``click.echo`` and skipped.

    Args:
        file_path: Path of the document to scan.
        output_directory: Directory that receives the extracted images.

    Errors raised by ``olefile`` or by file I/O (for example when the
    expected stream is missing from the container) propagate to the caller.
    """
    if not olefile.isOleFile(file_path):
        file_info = filetype.guess(file_path)
        file_format = "Unknown" if file_info is None else file_info.mime
        click.echo(
            f"Invalid file format: {file_path}. Expected OLE2 structured storage file, found: {file_format}."
        )
        return

    # (output extension, type name reported by imghdr, start marker, end marker).
    # imghdr reports "jpeg"/"tiff"/"bmp", never "jpg"/"tif"/"dib", so the
    # expected type name is stored separately from the output extension.
    # One row per distinct signature: alias extensions would only extract the
    # same bytes a second time. An end marker of None means the candidate
    # runs to the end of the stream.
    # NOTE(review): the TIFF end marker is a heuristic carried over as-is;
    # confirm it does not cut images short.
    image_formats = (
        ("jpeg", "jpeg", b"\xff\xd8", b"\xff\xd9"),
        ("tif", "tiff", b"\x49\x49\x2A\x00", b"\x00\x00\x00\x00"),
        ("bmp", "bmp", b"\x42\x4D", None),
        ("png", "png", b"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A", None),
    )

    # Name of the OLE stream to scan, keyed by lower-case file extension.
    file_stream_dict = {
        "doc": "Data",
        "dot": "Data",
        "dotm": "Data",
        "xlt": "Workbook",
        "xlsm": "Workbook",
        "pps": "Pictures",
        "pptm": "Pictures",
        "wps": "Data",
        "wpt": "Data",
        "dps": "Pictures",
        "dpt": "Pictures",
        "et": "Workbook",
        "ett": "Workbook",
    }

    file_ext = os.path.splitext(file_path)[1][1:].lower()
    stream_name = file_stream_dict.get(file_ext)
    if stream_name is None:
        # Without this guard openstream() would be called with None.
        click.echo(
            f"Unsupported file extension: {file_path}. No known image stream for '.{file_ext}' files."
        )
        return

    # Read the whole stream up front so the container is closed even when
    # opening or reading the stream fails.
    ole = olefile.OleFileIO(file_path)
    try:
        req_data = ole.openstream(stream_name).read()
    finally:
        ole.close()

    base_name = os.path.basename(file_path)
    image_count = 0
    for ext, imghdr_type, start_marker, end_marker in image_formats:
        start = 0
        while True:
            start = req_data.find(start_marker, start)
            if start == -1:
                break

            if end_marker is None:
                end = len(req_data)
            else:
                end_pos = req_data.find(end_marker, start)
                if end_pos == -1:
                    # No terminator after this start marker: nothing complete
                    # is left to extract for this format.
                    break
                end = end_pos + len(end_marker)

            image_data = req_data[start:end]
            image_count += 1
            if ext == "bmp":
                image_data = process_bmp_image(image_data)

            output_file = os.path.join(
                output_directory, f"{base_name}_{image_count}.{ext}"
            )

            # Never overwrite a file left by an earlier run.
            if os.path.exists(output_file):
                start = end
                continue

            with open(output_file, "wb") as f:
                f.write(image_data)

            # Validate what was written; on failure discard it and stop
            # scanning for this format.
            if imghdr.what(output_file) != imghdr_type or (
                ext == "bmp" and not is_valid_bmp(output_file)
            ):
                os.remove(output_file)
                break

            click.echo(
                f"Extracted {ext.upper()} image {image_count} from {file_path} to {output_file}"
            )

            start = end


def process_bmp_image(image_data: bytes) -> bytes:
    """Trim BMP data to the file size declared in its header.

    Args:
        image_data: Bytes that may start with a BMP file header.

    Returns:
        ``image_data`` cut to the little-endian 32-bit size stored at bytes
        2-5. The input is returned unchanged when it does not start with the
        ``BM`` signature or is too short to contain the size field.
    """
    # 2-byte "BM" signature followed by the 4-byte size field.
    if len(image_data) < 6 or image_data[:2] != b"\x42\x4D":
        return image_data

    (length,) = struct.unpack_from("<I", image_data, 2)
    # Slicing past the end is harmless, so an oversized declared length
    # simply keeps everything that is available.
    return image_data[:length]


def is_valid_bmp(file_path: str) -> bool:
    """Return True when the BMP header's reserved bytes are all zero.

    Bytes 6-9 of a BMP file header hold two 16-bit reserved fields that must
    be zero, which makes them a cheap filter for false "BM" signature hits.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        True if the four bytes at offset 6 are all zero; False otherwise,
        including when the file is too short to contain them.
    """
    with open(file_path, "rb") as f:
        f.seek(6)
        reserved = f.read(4)
    # Compare the raw bytes: a short read can never equal four zero bytes,
    # so truncated files are rejected instead of raising.
    return reserved == b"\x00" * 4


# Script entry point: run the click command when this file is executed directly.
if __name__ == "__main__":
    extract_images()
	import click
	import olefile
	import os
	import imghdr
	import struct
	import filetype


	@click.command()
	@click.argument("file_name", type=click.Path(exists=True))
	@click.argument("output_directory", type=click.Path())
	def extract_images(file_name: str, output_directory: str) -> None:
	    """Extract embedded images from an OLE2 file or from every file in a directory.

	    FILE_NAME is either a single file or a directory that is walked
	    recursively. OUTPUT_DIRECTORY receives the extracted images and is
	    created when it does not exist yet.

	    A failure while processing one file is reported on stderr together with
	    the file's path and does not stop the remaining files.
	    """
	    # exist_ok avoids the race of a separate exists() check before creating.
	    os.makedirs(output_directory, exist_ok=True)

	    if os.path.isfile(file_name):
	        file_paths = iter([file_name])
	    elif os.path.isdir(file_name):
	        file_paths = (
	            os.path.join(root, name)
	            for root, _dirs, names in os.walk(file_name)
	            for name in names
	        )
	    else:
	        click.echo("Invalid file or directory.")
	        return

	    for file_path in file_paths:
	        try:
	            process_file(file_path, output_directory)
	        except Exception as e:
	            # Best-effort batch run: report which file failed and keep going.
	            click.echo(f"Failed to process {file_path}: {e}", err=True)


	def process_file(file_path: str, output_directory: str) -> None:
	    """Extract embedded images from one OLE2 document into ``output_directory``.

	    The OLE stream to scan is chosen from the (case-insensitive) file
	    extension. The stream is read into memory and searched for the start
	    marker of each supported image format. Every candidate is written to
	    ``<basename>_<n>.<ext>`` and then validated with ``imghdr`` (plus
	    ``is_valid_bmp`` for BMP); a candidate that fails validation is deleted
	    and scanning for that format stops.

	    Files that are not OLE2 containers, or whose extension has no known
	    stream, are reported with ``click.echo`` and skipped.

	    Args:
	        file_path: Path of the document to scan.
	        output_directory: Directory that receives the extracted images.

	    Errors raised by ``olefile`` or by file I/O (for example when the
	    expected stream is missing from the container) propagate to the caller.
	    """
	    if not olefile.isOleFile(file_path):
	        file_info = filetype.guess(file_path)
	        file_format = "Unknown" if file_info is None else file_info.mime
	        click.echo(
	            f"Invalid file format: {file_path}. Expected OLE2 structured storage file, found: {file_format}."
	        )
	        return

	    # (output extension, type name reported by imghdr, start marker, end marker).
	    # imghdr reports "jpeg"/"tiff"/"bmp", never "jpg"/"tif"/"dib", so the
	    # expected type name is stored separately from the output extension.
	    # One row per distinct signature: alias extensions would only extract the
	    # same bytes a second time. An end marker of None means the candidate
	    # runs to the end of the stream.
	    # NOTE(review): the TIFF end marker is a heuristic carried over as-is;
	    # confirm it does not cut images short.
	    image_formats = (
	        ("jpeg", "jpeg", b"\xff\xd8", b"\xff\xd9"),
	        ("tif", "tiff", b"\x49\x49\x2A\x00", b"\x00\x00\x00\x00"),
	        ("bmp", "bmp", b"\x42\x4D", None),
	        ("png", "png", b"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A", None),
	    )

	    # Name of the OLE stream to scan, keyed by lower-case file extension.
	    file_stream_dict = {
	        "doc": "Data",
	        "dot": "Data",
	        "dotm": "Data",
	        "xlt": "Workbook",
	        "xlsm": "Workbook",
	        "pps": "Pictures",
	        "pptm": "Pictures",
	        "wps": "Data",
	        "wpt": "Data",
	        "dps": "Pictures",
	        "dpt": "Pictures",
	        "et": "Workbook",
	        "ett": "Workbook",
	    }

	    file_ext = os.path.splitext(file_path)[1][1:].lower()
	    stream_name = file_stream_dict.get(file_ext)
	    if stream_name is None:
	        # Without this guard openstream() would be called with None.
	        click.echo(
	            f"Unsupported file extension: {file_path}. No known image stream for '.{file_ext}' files."
	        )
	        return

	    # Read the whole stream up front so the container is closed even when
	    # opening or reading the stream fails.
	    ole = olefile.OleFileIO(file_path)
	    try:
	        req_data = ole.openstream(stream_name).read()
	    finally:
	        ole.close()

	    base_name = os.path.basename(file_path)
	    image_count = 0
	    for ext, imghdr_type, start_marker, end_marker in image_formats:
	        start = 0
	        while True:
	            start = req_data.find(start_marker, start)
	            if start == -1:
	                break

	            if end_marker is None:
	                end = len(req_data)
	            else:
	                end_pos = req_data.find(end_marker, start)
	                if end_pos == -1:
	                    # No terminator after this start marker: nothing complete
	                    # is left to extract for this format.
	                    break
	                end = end_pos + len(end_marker)

	            image_data = req_data[start:end]
	            image_count += 1
	            if ext == "bmp":
	                image_data = process_bmp_image(image_data)

	            output_file = os.path.join(
	                output_directory, f"{base_name}_{image_count}.{ext}"
	            )

	            # Never overwrite a file left by an earlier run.
	            if os.path.exists(output_file):
	                start = end
	                continue

	            with open(output_file, "wb") as f:
	                f.write(image_data)

	            # Validate what was written; on failure discard it and stop
	            # scanning for this format.
	            if imghdr.what(output_file) != imghdr_type or (
	                ext == "bmp" and not is_valid_bmp(output_file)
	            ):
	                os.remove(output_file)
	                break

	            click.echo(
	                f"Extracted {ext.upper()} image {image_count} from {file_path} to {output_file}"
	            )

	            start = end


	def process_bmp_image(image_data: bytes) -> bytes:
	    """Trim BMP data to the file size declared in its header.

	    Args:
	        image_data: Bytes that may start with a BMP file header.

	    Returns:
	        ``image_data`` cut to the little-endian 32-bit size stored at bytes
	        2-5. The input is returned unchanged when it does not start with the
	        ``BM`` signature or is too short to contain the size field.
	    """
	    # 2-byte "BM" signature followed by the 4-byte size field.
	    if len(image_data) < 6 or image_data[:2] != b"\x42\x4D":
	        return image_data

	    (length,) = struct.unpack_from("<I", image_data, 2)
	    # Slicing past the end is harmless, so an oversized declared length
	    # simply keeps everything that is available.
	    return image_data[:length]


	def is_valid_bmp(file_path: str) -> bool:
	    """Return True when the BMP header's reserved bytes are all zero.

	    Bytes 6-9 of a BMP file header hold two 16-bit reserved fields that must
	    be zero, which makes them a cheap filter for false "BM" signature hits.

	    Args:
	        file_path: Path of the file to inspect.

	    Returns:
	        True if the four bytes at offset 6 are all zero; False otherwise,
	        including when the file is too short to contain them.
	    """
	    with open(file_path, "rb") as f:
	        f.seek(6)
	        reserved = f.read(4)
	    # Compare the raw bytes: a short read can never equal four zero bytes,
	    # so truncated files are rejected instead of raising.
	    return reserved == b"\x00" * 4


	# Script entry point: run the click command when this file is executed directly.
	if __name__ == "__main__":
	    extract_images()