# tsumarios/eml_4n6.py

## eml_4n6.py
#!/usr/bin/env python3

"""
Script: eml_4n6.py
Description: Extract attachments and log metadata from EML files.
Author: tsumarios
Date: 21/12/2023
Note: This script is based on https://github.com/diogo-alves/eml-extractor and extends it with logging and other forensics features.

Usage:
    ./eml_4n6.py [OPTIONS]

Options:
    -a, --analyst ANALYST_NAME
        Analyst name for chain of custody information (default: system name).

    -s, --source PATH
        The directory containing the .eml files to extract attachments (default: current working directory).

    -r, --recursive
        Allow recursive search for .eml files under the SOURCE directory.

    -f, --files FILE [FILE ...]
        Specify a .eml file or a list of .eml files to extract attachments.

    -d, --destination PATH
        The directory to extract attachments to (default: current working directory).
"""

import os
import re
import logging
from argparse import ArgumentParser, ArgumentTypeError
from datetime import datetime
import hashlib
from email import message_from_file, policy
from pathlib import Path

# Constants
# Suffix that identifies e-mail message files.
EML_FILE_EXTENSION = ".eml"
# Character class of the symbols that sanitise_foldername() replaces.
ILLEGAL_CHARS_REGEX = r'[/\\|\[\]\{\}:<>+=;,?!*"~#$%&@\']'

# Logging configuration: INFO and above, each record prefixed with a
# timestamp and its level name.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)


def extract_attachments(file: Path, destination: Path, analyst_name: str) -> None:
    """
    Extract attachments from an EML file.

    The Subject, From, To and Date headers are logged, then every non-inline
    attachment is scanned for URLs, written below
    ``destination/<sanitised subject>/`` and the SHA-256 of the written file
    is logged. Any exception is logged and swallowed so that one bad message
    does not stop a batch run.

    Args:
        file (Path): Path to the EML file.
        destination (Path): Path to the destination directory for extracted attachments.
        analyst_name (str): Name of the analyst for chain of custody information.
    """
    # Function-scope import: the module header only imports message_from_file.
    from email import message_from_binary_file

    try:
        logger.info(f'Processing file: "{file}"')

        # Log Chain of Custody information
        logger.info(f"Analyst: {analyst_name}, Date: {datetime.now()}")

        # Parse the raw bytes: a text-mode read with errors="ignore" silently
        # drops undecodable bytes. The handle is released as soon as parsing
        # is finished.
        with file.open("rb") as f:
            email_message = message_from_binary_file(f, policy=policy.default)

        email_subject = email_message.get("Subject")
        basepath = destination / _attachment_folder_name(email_subject, file)

        # Extract and log additional metadata
        email_sender = email_message.get("From")
        email_receiver = email_message.get("To")
        email_date_sent = email_message.get("Date")
        # NOTE(review): this is the time of processing, not a value read from
        # the message, although it is logged under "Date Received".
        email_date_received = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        # Log metadata
        logger.info(f"Subject: {email_subject}")
        logger.info(f"Sender: {email_sender}")
        logger.info(f"Receiver: {email_receiver}")
        logger.info(f"Date Sent: {email_date_sent}")
        logger.info(f"Date Received: {email_date_received}")

        # Ignore inline attachments
        attachments = [
            item
            for item in email_message.iter_attachments()
            if item.is_attachment()
        ]

        if not attachments:
            logger.info("No attachments found.")
            return

        for index, attachment in enumerate(attachments, start=1):
            raw_name = attachment.get_filename()
            logger.info(f"Attachment found: {raw_name}")

            # The name comes from the (untrusted) message: never use it as a path.
            filename = _safe_attachment_name(raw_name, index)
            if filename != raw_name:
                logger.warning(
                    f'Attachment name "{raw_name}" is not a safe file name; saving as "{filename}"'
                )
            filepath = basepath / filename

            payload = _attachment_bytes(attachment)
            if payload is None:
                logger.warning(f'No payload could be decoded for "{filename}". Skipping...')
                continue

            # The URL scan works on a lossy text rendering; the bytes written
            # to disk below are untouched.
            log_forensic_artefacts(payload.decode("utf-8", errors="replace"))

            if filepath.exists():
                overwrite = input(
                    f'The file "{filename}" already exists! Overwrite it (Y/N)? '
                )
                if overwrite.strip().upper() != "Y":
                    # Nothing was written, so no hash is logged: it would
                    # describe the pre-existing file, not this attachment.
                    logger.info("Skipping...")
                    continue
            else:
                basepath.mkdir(exist_ok=True)

            save_attachment(filepath, payload)

            # Log hash value
            file_hash = hash_file(filepath)
            logger.info(f"Attachment hash ({filename}): {file_hash}")

    except Exception as e:
        logger.error(f"Error processing file: {file}. Exception: {e}", exc_info=True)


def _attachment_folder_name(subject, file: Path) -> str:
    """Return the output folder name for a message, falling back to the EML file stem."""
    folder = sanitise_foldername(str(subject)) if subject is not None else ""
    unusable = {"", ".", ".."}
    # A missing, blank or dot-only subject would either crash the sanitiser or
    # resolve to the destination itself / its parent directory.
    if folder.strip() in unusable:
        folder = sanitise_foldername(file.stem)
    if folder.strip() in unusable:
        folder = "no_subject"
    return folder


def _safe_attachment_name(filename, index: int) -> str:
    """Return a bare file name for an attachment, never a path."""
    if filename:
        # Keep only the last path component so that names such as
        # "../../x" or "C:\\x" cannot leave the target folder.
        flattened = str(filename).replace("\\", "/").replace("\x00", "")
        candidate = Path(flattened).name.strip()
        if candidate and candidate not in {".", ".."}:
            return candidate
    # No usable name in the message: derive one from the attachment position.
    return f"attachment_{index}"


def _attachment_bytes(attachment):
    """Return the decoded bytes of an attachment, or None when there are none."""
    payload = attachment.get_payload(decode=True)
    if payload is not None:
        return payload
    if attachment.is_multipart():
        # get_payload(decode=True) yields None for multipart containers such
        # as an attached message/rfc822; serialise the nested part(s) instead.
        return b"".join(part.as_bytes() for part in attachment.get_payload())
    return None


def log_forensic_artefacts(payload: str) -> None:
    """
    Log forensic artefacts found in the payload.

    At present the only artefact type is URLs: every http/https match is
    written to the log at INFO level, in order of appearance.

    Args:
        payload (str): Payload to analyse for forensic artefacts.
    """
    # "[$-_@.&+]" contains the range "$" to "_", which spans digits,
    # upper-case letters and punctuation such as "/", ":", "?" and "=".
    url_pattern = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    for found in re.finditer(url_pattern, payload):
        logger.info(f"Found URL: {found.group(0)}")


def sanitise_foldername(name: str) -> str:
    """
    Make a string usable as a folder name.

    Every character matched by ILLEGAL_CHARS_REGEX is replaced with an
    underscore; all other characters are kept as they are.

    Args:
        name (str): Folder name to sanitise.

    Returns:
        str: Sanitised folder name.
    """
    illegal_chars = re.compile(ILLEGAL_CHARS_REGEX)
    return illegal_chars.sub("_", name)


def save_attachment(file: Path, payload: bytes) -> None:
    """
    Write an attachment payload to disk.

    The target is opened in binary write mode, so an existing file at the
    same path is replaced.

    Args:
        file (Path): Path to save the attachment.
        payload (bytes): Attachment payload to save.
    """
    with file.open(mode="wb") as sink:
        # Logged only once the file has been opened successfully.
        logger.info(f'Saving attachment to "{file}"')
        sink.write(payload)


def hash_file(file_path: Path) -> str:
    """
    Compute the SHA-256 digest of a file's contents.

    The file is read in 8192-byte blocks, so it is never loaded into memory
    as a whole.

    Args:
        file_path (Path): Path to the file.

    Returns:
        str: SHA-256 hash of the file, as a hexadecimal string.
    """
    digest = hashlib.sha256()
    with file_path.open("rb") as stream:
        # iter() with a sentinel keeps calling read() until it returns b"".
        for block in iter(lambda: stream.read(8192), b""):
            digest.update(block)
    return digest.hexdigest()


def get_eml_files_from(path: Path, recursively: bool = False) -> list:
    """
    Get a list of EML files from a directory.

    The extension is matched case-insensitively, so "MAIL.EML" is found on
    case-sensitive filesystems too. Only regular files are returned (a
    directory named "x.eml" is skipped) and the result is sorted so that
    runs are reproducible.

    Args:
        path (Path): Directory path.
        recursively (bool, optional): Whether to search recursively. Defaults to False.

    Returns:
        list: Sorted list of EML file paths.
    """
    candidates = path.rglob("*") if recursively else path.glob("*")
    return sorted(
        entry
        for entry in candidates
        if entry.name.lower().endswith(".eml") and entry.is_file()
    )


def check_file(arg_value: str) -> Path:
    """
    Check if the argument is a valid EML file.

    Intended as an argparse ``type=`` callable. The extension is compared
    case-insensitively, so "MAIL.EML" is accepted as well as "mail.eml".

    Args:
        arg_value (str): Argument value.

    Returns:
        Path: Valid EML file path.

    Raises:
        ArgumentTypeError: If the argument is not an existing file with an
            .eml extension.
    """
    file = Path(arg_value)
    if file.is_file() and file.suffix.lower() == ".eml":
        return file
    raise ArgumentTypeError(f'"{file}" is not a valid EML file.')


def check_path(arg_value: str) -> Path:
    """
    Validate that the argument names an existing directory.

    Intended as an argparse ``type=`` callable.

    Args:
        arg_value (str): Argument value.

    Returns:
        Path: Valid directory path.

    Raises:
        ArgumentTypeError: If the argument is not a valid directory.
    """
    candidate = Path(arg_value)
    if not candidate.is_dir():
        raise ArgumentTypeError(f'"{candidate}" is not a valid directory.')
    return candidate


def parse_arguments(argv=None):
    """
    Parse command-line arguments.

    Args:
        argv (list, optional): Argument strings to parse. Defaults to None,
            in which case argparse reads sys.argv[1:].

    Returns:
        Namespace: Parsed arguments (analyst, source, recursive, files,
        destination).
    """
    # Function-scope import: platform is not part of the module import block.
    import platform

    parser = ArgumentParser(
        usage="%(prog)s [OPTIONS]", description="Extracts attachments from .eml files"
    )
    # Include analyst argument with default value. The environment variables
    # are tried first; platform.node() covers systems where neither of them
    # is exported, so the default really is the system name.
    parser.add_argument(
        "-a",
        "--analyst",
        type=str,
        default=(
            os.getenv("COMPUTERNAME")
            or os.getenv("HOSTNAME")
            or platform.node()
            or "DefaultAnalyst"
        ),
        metavar="ANALYST_NAME",
        help="Analyst name for chain of custody information (default: system name)",
    )
    # force the use of --source or --files, not both
    source_group = parser.add_mutually_exclusive_group()
    source_group.add_argument(
        "-s",
        "--source",
        type=check_path,
        default=Path.cwd(),
        metavar="PATH",
        help="the directory containing the .eml files to extract attachments (default: current working directory)",
    )
    parser.add_argument(
        "-r",
        "--recursive",
        action="store_true",
        help="allow recursive search for .eml files under SOURCE directory",
    )
    source_group.add_argument(
        "-f",
        "--files",
        nargs="+",
        type=check_file,
        metavar="FILE",
        help="specify a .eml file or a list of .eml files to extract attachments",
    )
    parser.add_argument(
        "-d",
        "--destination",
        type=check_path,
        default=Path.cwd(),
        metavar="PATH",
        help="the directory to extract attachments to (default: current working directory)",
    )
    return parser.parse_args(argv)


def main():
    """
    Run the script: parse the options, collect the EML files and process each one.

    Files named with --files take precedence; otherwise the SOURCE directory
    is searched. A failure on one file is logged and the next file is tried.
    """
    args = parse_arguments()

    if args.files:
        eml_files = args.files
    else:
        eml_files = get_eml_files_from(args.source, args.recursive)

    if not eml_files:
        logger.info("No EML files found!")

    for eml_file in eml_files:
        try:
            extract_attachments(
                eml_file, destination=args.destination, analyst_name=args.analyst
            )
        except Exception as exc:
            # Keep going: one unreadable message must not stop the batch.
            logger.error(
                f"Error processing file: {eml_file}. Exception: {exc}", exc_info=True
            )
    logger.info("Done.")


# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
	#!/usr/bin/env python3

	"""
	Script: eml_4n6.py
	Description: Extract attachments and log metadata from EML files.
	Author: tsumarios
	Date: 21/12/2023
	Note: This script is based on https://github.com/diogo-alves/eml-extractor and extends it with logging and other forensics features.

	Usage:
	./eml_4n6.py [OPTIONS]

	Options:
	-a, --analyst ANALYST_NAME
	Analyst name for chain of custody information (default: system name).

	-s, --source PATH
	The directory containing the .eml files to extract attachments (default: current working directory).

	-r, --recursive
	Allow recursive search for .eml files under the SOURCE directory.

	-f, --files FILE [FILE ...]
	Specify a .eml file or a list of .eml files to extract attachments.

	-d, --destination PATH
	The directory to extract attachments to (default: current working directory).
	"""

	import os
	import re
	import logging
	from argparse import ArgumentParser, ArgumentTypeError
	from datetime import datetime
	import hashlib
	from email import message_from_file, policy
	from pathlib import Path

	# Constants
	# Suffix that identifies e-mail message files.
	EML_FILE_EXTENSION = ".eml"
	# Character class of the symbols that sanitise_foldername() replaces.
	ILLEGAL_CHARS_REGEX = r'[/\\\|\[\]\{\}:<>+=;,?!*"~#$%&@\']'

	# Logging configuration: INFO and above, each record prefixed with a
	# timestamp and its level name.
	logging.basicConfig(
	    format="%(asctime)s [%(levelname)s] %(message)s",
	    level=logging.INFO,
	)
	logger = logging.getLogger(__name__)


	def extract_attachments(file: Path, destination: Path, analyst_name: str) -> None:
	    """
	    Extract attachments from an EML file.

	    The Subject, From, To and Date headers are logged, then every non-inline
	    attachment is scanned for URLs, written below
	    ``destination/<sanitised subject>/`` and the SHA-256 of the written file
	    is logged. Any exception is logged and swallowed so that one bad message
	    does not stop a batch run.

	    Args:
	        file (Path): Path to the EML file.
	        destination (Path): Path to the destination directory for extracted attachments.
	        analyst_name (str): Name of the analyst for chain of custody information.
	    """
	    # Function-scope import: the module header only imports message_from_file.
	    from email import message_from_binary_file

	    try:
	        logger.info(f'Processing file: "{file}"')

	        # Log Chain of Custody information
	        logger.info(f"Analyst: {analyst_name}, Date: {datetime.now()}")

	        # Parse the raw bytes: a text-mode read with errors="ignore" silently
	        # drops undecodable bytes. The handle is released as soon as parsing
	        # is finished.
	        with file.open("rb") as f:
	            email_message = message_from_binary_file(f, policy=policy.default)

	        email_subject = email_message.get("Subject")
	        basepath = destination / _attachment_folder_name(email_subject, file)

	        # Extract and log additional metadata
	        email_sender = email_message.get("From")
	        email_receiver = email_message.get("To")
	        email_date_sent = email_message.get("Date")
	        # NOTE(review): this is the time of processing, not a value read from
	        # the message, although it is logged under "Date Received".
	        email_date_received = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

	        # Log metadata
	        logger.info(f"Subject: {email_subject}")
	        logger.info(f"Sender: {email_sender}")
	        logger.info(f"Receiver: {email_receiver}")
	        logger.info(f"Date Sent: {email_date_sent}")
	        logger.info(f"Date Received: {email_date_received}")

	        # Ignore inline attachments
	        attachments = [
	            item
	            for item in email_message.iter_attachments()
	            if item.is_attachment()
	        ]

	        if not attachments:
	            logger.info("No attachments found.")
	            return

	        for index, attachment in enumerate(attachments, start=1):
	            raw_name = attachment.get_filename()
	            logger.info(f"Attachment found: {raw_name}")

	            # The name comes from the (untrusted) message: never use it as a path.
	            filename = _safe_attachment_name(raw_name, index)
	            if filename != raw_name:
	                logger.warning(
	                    f'Attachment name "{raw_name}" is not a safe file name; saving as "{filename}"'
	                )
	            filepath = basepath / filename

	            payload = _attachment_bytes(attachment)
	            if payload is None:
	                logger.warning(f'No payload could be decoded for "{filename}". Skipping...')
	                continue

	            # The URL scan works on a lossy text rendering; the bytes written
	            # to disk below are untouched.
	            log_forensic_artefacts(payload.decode("utf-8", errors="replace"))

	            if filepath.exists():
	                overwrite = input(
	                    f'The file "{filename}" already exists! Overwrite it (Y/N)? '
	                )
	                if overwrite.strip().upper() != "Y":
	                    # Nothing was written, so no hash is logged: it would
	                    # describe the pre-existing file, not this attachment.
	                    logger.info("Skipping...")
	                    continue
	            else:
	                basepath.mkdir(exist_ok=True)

	            save_attachment(filepath, payload)

	            # Log hash value
	            file_hash = hash_file(filepath)
	            logger.info(f"Attachment hash ({filename}): {file_hash}")

	    except Exception as e:
	        logger.error(f"Error processing file: {file}. Exception: {e}", exc_info=True)


	def _attachment_folder_name(subject, file: Path) -> str:
	    """Return the output folder name for a message, falling back to the EML file stem."""
	    folder = sanitise_foldername(str(subject)) if subject is not None else ""
	    unusable = {"", ".", ".."}
	    # A missing, blank or dot-only subject would either crash the sanitiser or
	    # resolve to the destination itself / its parent directory.
	    if folder.strip() in unusable:
	        folder = sanitise_foldername(file.stem)
	    if folder.strip() in unusable:
	        folder = "no_subject"
	    return folder


	def _safe_attachment_name(filename, index: int) -> str:
	    """Return a bare file name for an attachment, never a path."""
	    if filename:
	        # Keep only the last path component so that names such as
	        # "../../x" or "C:\\x" cannot leave the target folder.
	        flattened = str(filename).replace("\\", "/").replace("\x00", "")
	        candidate = Path(flattened).name.strip()
	        if candidate and candidate not in {".", ".."}:
	            return candidate
	    # No usable name in the message: derive one from the attachment position.
	    return f"attachment_{index}"


	def _attachment_bytes(attachment):
	    """Return the decoded bytes of an attachment, or None when there are none."""
	    payload = attachment.get_payload(decode=True)
	    if payload is not None:
	        return payload
	    if attachment.is_multipart():
	        # get_payload(decode=True) yields None for multipart containers such
	        # as an attached message/rfc822; serialise the nested part(s) instead.
	        return b"".join(part.as_bytes() for part in attachment.get_payload())
	    return None


	def log_forensic_artefacts(payload: str) -> None:
	    """
	    Log forensic artefacts found in the payload.

	    At present the only artefact type is URLs: every http/https match is
	    written to the log at INFO level, in order of appearance.

	    Args:
	        payload (str): Payload to analyse for forensic artefacts.
	    """
	    # Extract and log URLs. The alternatives are separated by a plain "|":
	    # an escaped "\|" would demand a literal pipe between URL characters
	    # and match no real URL.
	    urls = re.findall(
	        r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
	        payload,
	    )
	    for url in urls:
	        logger.info(f"Found URL: {url}")


	def sanitise_foldername(name: str) -> str:
	    """
	    Make a string usable as a folder name.

	    Every character matched by ILLEGAL_CHARS_REGEX is replaced with an
	    underscore; all other characters are kept as they are.

	    Args:
	        name (str): Folder name to sanitise.

	    Returns:
	        str: Sanitised folder name.
	    """
	    return re.sub(ILLEGAL_CHARS_REGEX, "_", name)


	def save_attachment(file: Path, payload: bytes) -> None:
	    """
	    Write an attachment payload to disk.

	    The target is opened in binary write mode, so an existing file at the
	    same path is replaced.

	    Args:
	        file (Path): Path to save the attachment.
	        payload (bytes): Attachment payload to save.
	    """
	    with file.open("wb") as f:
	        # Logged only once the file has been opened successfully.
	        logger.info(f'Saving attachment to "{file}"')
	        f.write(payload)


	def hash_file(file_path: Path) -> str:
	    """
	    Compute the SHA-256 digest of a file's contents.

	    The file is read in 8192-byte blocks, so it is never loaded into memory
	    as a whole.

	    Args:
	        file_path (Path): Path to the file.

	    Returns:
	        str: SHA-256 hash of the file, as a hexadecimal string.
	    """
	    hasher = hashlib.sha256()
	    with file_path.open("rb") as file:
	        # read() returns b"" at end of file, which ends the loop.
	        while chunk := file.read(8192):
	            hasher.update(chunk)
	    return hasher.hexdigest()


	def get_eml_files_from(path: Path, recursively: bool = False) -> list:
	    """
	    Get a list of EML files from a directory.

	    The extension is matched case-insensitively, so "MAIL.EML" is found on
	    case-sensitive filesystems too. Only regular files are returned (a
	    directory named "x.eml" is skipped) and the result is sorted so that
	    runs are reproducible.

	    Args:
	        path (Path): Directory path.
	        recursively (bool, optional): Whether to search recursively. Defaults to False.

	    Returns:
	        list: Sorted list of EML file paths.
	    """
	    candidates = path.rglob("*") if recursively else path.glob("*")
	    return sorted(
	        entry
	        for entry in candidates
	        if entry.name.lower().endswith(".eml") and entry.is_file()
	    )


	def check_file(arg_value: str) -> Path:
	    """
	    Check if the argument is a valid EML file.

	    Intended as an argparse ``type=`` callable. The extension is compared
	    case-insensitively, so "MAIL.EML" is accepted as well as "mail.eml".

	    Args:
	        arg_value (str): Argument value.

	    Returns:
	        Path: Valid EML file path.

	    Raises:
	        ArgumentTypeError: If the argument is not an existing file with an
	            .eml extension.
	    """
	    file = Path(arg_value)
	    if file.is_file() and file.suffix.lower() == ".eml":
	        return file
	    raise ArgumentTypeError(f'"{file}" is not a valid EML file.')


	def check_path(arg_value: str) -> Path:
	    """
	    Validate that the argument names an existing directory.

	    Intended as an argparse ``type=`` callable.

	    Args:
	        arg_value (str): Argument value.

	    Returns:
	        Path: Valid directory path.

	    Raises:
	        ArgumentTypeError: If the argument is not a valid directory.
	    """
	    path = Path(arg_value)
	    if path.is_dir():
	        return path
	    raise ArgumentTypeError(f'"{path}" is not a valid directory.')


	def parse_arguments(argv=None):
	    """
	    Parse command-line arguments.

	    Args:
	        argv (list, optional): Argument strings to parse. Defaults to None,
	            in which case argparse reads sys.argv[1:].

	    Returns:
	        Namespace: Parsed arguments (analyst, source, recursive, files,
	        destination).
	    """
	    # Function-scope import: platform is not part of the module import block.
	    import platform

	    parser = ArgumentParser(
	        usage="%(prog)s [OPTIONS]", description="Extracts attachments from .eml files"
	    )
	    # Include analyst argument with default value. The environment variables
	    # are tried first; platform.node() covers systems where neither of them
	    # is exported, so the default really is the system name.
	    parser.add_argument(
	        "-a",
	        "--analyst",
	        type=str,
	        default=(
	            os.getenv("COMPUTERNAME")
	            or os.getenv("HOSTNAME")
	            or platform.node()
	            or "DefaultAnalyst"
	        ),
	        metavar="ANALYST_NAME",
	        help="Analyst name for chain of custody information (default: system name)",
	    )
	    # force the use of --source or --files, not both
	    source_group = parser.add_mutually_exclusive_group()
	    source_group.add_argument(
	        "-s",
	        "--source",
	        type=check_path,
	        default=Path.cwd(),
	        metavar="PATH",
	        help="the directory containing the .eml files to extract attachments (default: current working directory)",
	    )
	    parser.add_argument(
	        "-r",
	        "--recursive",
	        action="store_true",
	        help="allow recursive search for .eml files under SOURCE directory",
	    )
	    source_group.add_argument(
	        "-f",
	        "--files",
	        nargs="+",
	        type=check_file,
	        metavar="FILE",
	        help="specify a .eml file or a list of .eml files to extract attachments",
	    )
	    parser.add_argument(
	        "-d",
	        "--destination",
	        type=check_path,
	        default=Path.cwd(),
	        metavar="PATH",
	        help="the directory to extract attachments to (default: current working directory)",
	    )
	    return parser.parse_args(argv)


	def main():
	    """
	    Run the script: parse the options, collect the EML files and process each one.

	    Files named with --files take precedence; otherwise the SOURCE directory
	    is searched. A failure on one file is logged and the next file is tried.
	    """
	    args = parse_arguments()

	    eml_files = args.files or get_eml_files_from(args.source, args.recursive)
	    if not eml_files:
	        logger.info("No EML files found!")

	    for file in eml_files:
	        try:
	            extract_attachments(
	                file, destination=args.destination, analyst_name=args.analyst
	            )
	        except Exception as e:
	            # Keep going: one unreadable message must not stop the batch.
	            logger.error(
	                f"Error processing file: {file}. Exception: {e}", exc_info=True
	            )
	            continue
	    logger.info("Done.")


	# Run only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()