# salbahra/organize-eml.py

## organize-eml.py
import os
import email
from email.utils import parsedate_to_datetime
import shutil
import re
import unicodedata
from email.header import decode_header
import base64
import quopri

def decode_subject(subject):
    """Decode an RFC 2047 encoded header value into a plain string.

    ``decode_header`` has already undone the base64 / quoted-printable
    transfer encoding of every encoded word, so each chunk only has to be
    converted from bytes to text using the charset reported next to it.

    Args:
        subject: Raw header value, possibly containing
            ``=?charset?enc?text?=`` encoded words.

    Returns:
        The decoded text. Bytes that are invalid for their charset are
        replaced with U+FFFD instead of raising.
    """
    decoded_parts = []
    for part, charset in decode_header(subject):
        if isinstance(part, str):
            # A header without any encoded word is handed back as str
            decoded_parts.append(part)
            continue
        try:
            # Chunks without a charset are unencoded text; treat as UTF-8
            decoded_parts.append(part.decode(charset or 'utf-8', errors='replace'))
        except LookupError:
            # Charset name unknown to Python: best-effort UTF-8 decode
            decoded_parts.append(part.decode('utf-8', errors='replace'))
    return ''.join(decoded_parts)

def sanitize_filename(filename):
    """Turn an e-mail subject into a conservative, ASCII-only file name stem.

    The value is decoded with ``decode_subject``, reduced to ASCII, has its
    inner spaces replaced by underscores, and is stripped of every character
    that is not a word character, ``-``, ``_`` or ``.``.

    Args:
        filename: Raw header value to derive the name from.

    Returns:
        The sanitized stem, at most 255 characters long and never empty:
        ``'No_Subject'`` is returned when nothing usable is left.
    """
    filename = decode_subject(filename)
    # NFKD splits accented letters into base letter + combining mark, so the
    # ASCII encode below keeps the base letter and drops only the mark.
    filename = unicodedata.normalize('NFKD', filename).encode('ASCII', 'ignore').decode()
    # Replace spaces
    filename = filename.strip().replace(' ', '_')
    # Remove any other potentially problematic characters
    filename = re.sub(r'[^\w\-_\. ]', '', filename)
    # Truncate to a reasonable length (e.g., 255 characters)
    filename = filename[:255]
    # An empty stem would make the caller write a bare ".eml" file, which
    # os.path.splitext treats as a name without any extension.
    return filename or 'No_Subject'

def organize_eml_files(source_folder, destination_folder):
    """Move every ``.eml`` file under *source_folder* into a date-based tree.

    Each message is parsed for its ``Date`` and ``Subject`` headers and moved
    to ``destination_folder/YYYY/MM/DD/<sanitized subject>.eml``. When the
    target name is taken, ``_1``, ``_2``, ... is inserted before the
    extension. The moved file's access and modification times are set to the
    message date. Progress and per-file problems are reported with ``print``;
    a failure on one file does not abort the run.

    Args:
        source_folder: Directory that is walked recursively for ``.eml`` files.
        destination_folder: Root of the output tree; sub-folders are created
            on demand.
    """
    extension = '.eml'
    # Common per-name limit on mainstream filesystems; the extension and any
    # collision suffix must fit inside it together with the subject.
    max_name_length = 255
    destination_root = os.path.abspath(destination_folder)

    for root, dirs, files in os.walk(source_folder):
        # Prune the output tree from the walk so files that were just moved
        # (or moved by an earlier run) are not picked up and renamed again.
        dirs[:] = [
            d for d in dirs
            if os.path.abspath(os.path.join(root, d)) != destination_root
        ]

        for filename in files:
            if not filename.endswith(extension):
                continue

            file_path = os.path.join(root, filename)
            try:
                with open(file_path, 'rb') as file:
                    msg = email.message_from_binary_file(file)

                date_str = msg.get('Date')
                if not date_str:
                    print(f"No Date header found in {filename}")
                    continue

                subject = msg.get('Subject', 'No Subject')
                date = parsedate_to_datetime(date_str)
                year = str(date.year)
                month = f"{date.month:02d}"
                day = f"{date.day:02d}"

                new_folder = os.path.join(destination_folder, year, month, day)
                os.makedirs(new_folder, exist_ok=True)

                # Sanitize the subject and leave room for the extension
                stem = sanitize_filename(subject)
                safe_filename = stem[:max_name_length - len(extension)] + extension
                new_file_path = os.path.join(new_folder, safe_filename)

                # Pick the first free "<stem>_<n>.eml" if the name is taken
                counter = 1
                while os.path.exists(new_file_path):
                    suffix = f"_{counter}{extension}"
                    candidate = stem[:max_name_length - len(suffix)] + suffix
                    new_file_path = os.path.join(new_folder, candidate)
                    counter += 1

                # Move the file
                shutil.move(file_path, new_file_path)

                # os.utime sets (access time, modification time)
                timestamp = date.timestamp()
                os.utime(new_file_path, (timestamp, timestamp))

                print(f"Moved {filename} to {new_file_path}")

                # Log any filename changes
                if safe_filename != filename:
                    print(f"  Original filename: {filename}")
                    print(f"  Sanitized filename: {safe_filename}")

            except Exception as e:
                # Per-file boundary: report and continue with the next file
                print(f"Error processing {filename}: {str(e)}")

# Usage: replace the two placeholder paths, then run this file as a script.
source_folder = '/path/to/input'
destination_folder = '/path/to/output'

# Only move files when executed directly, so importing this module for its
# helper functions does not touch the filesystem.
if __name__ == "__main__":
    organize_eml_files(source_folder, destination_folder)
	import os
	import email
	from email.utils import parsedate_to_datetime
	import shutil
	import re
	import unicodedata
	from email.header import decode_header
	import base64
	import quopri

	def decode_subject(subject):
	    """Decode an RFC 2047 encoded header value into a plain string.

	    ``decode_header`` has already undone the base64 / quoted-printable
	    transfer encoding of every encoded word, so each chunk only has to be
	    converted from bytes to text using the charset reported next to it.

	    Args:
	        subject: Raw header value, possibly containing
	            ``=?charset?enc?text?=`` encoded words.

	    Returns:
	        The decoded text. Bytes that are invalid for their charset are
	        replaced with U+FFFD instead of raising.
	    """
	    decoded_parts = []
	    for part, charset in decode_header(subject):
	        if isinstance(part, str):
	            # A header without any encoded word is handed back as str
	            decoded_parts.append(part)
	            continue
	        try:
	            # Chunks without a charset are unencoded text; treat as UTF-8
	            decoded_parts.append(part.decode(charset or 'utf-8', errors='replace'))
	        except LookupError:
	            # Charset name unknown to Python: best-effort UTF-8 decode
	            decoded_parts.append(part.decode('utf-8', errors='replace'))
	    return ''.join(decoded_parts)

	def sanitize_filename(filename):
	    """Turn an e-mail subject into a conservative, ASCII-only file name stem.

	    The value is decoded with ``decode_subject``, reduced to ASCII, has its
	    inner spaces replaced by underscores, and is stripped of every character
	    that is not a word character, ``-``, ``_`` or ``.``.

	    Args:
	        filename: Raw header value to derive the name from.

	    Returns:
	        The sanitized stem, at most 255 characters long and never empty:
	        ``'No_Subject'`` is returned when nothing usable is left.
	    """
	    filename = decode_subject(filename)
	    # NFKD splits accented letters into base letter + combining mark, so the
	    # ASCII encode below keeps the base letter and drops only the mark.
	    filename = unicodedata.normalize('NFKD', filename).encode('ASCII', 'ignore').decode()
	    # Replace spaces
	    filename = filename.strip().replace(' ', '_')
	    # Remove any other potentially problematic characters
	    filename = re.sub(r'[^\w\-_\. ]', '', filename)
	    # Truncate to a reasonable length (e.g., 255 characters)
	    filename = filename[:255]
	    # An empty stem would make the caller write a bare ".eml" file, which
	    # os.path.splitext treats as a name without any extension.
	    return filename or 'No_Subject'

	def organize_eml_files(source_folder, destination_folder):
	    """Move every ``.eml`` file under *source_folder* into a date-based tree.

	    Each message is parsed for its ``Date`` and ``Subject`` headers and moved
	    to ``destination_folder/YYYY/MM/DD/<sanitized subject>.eml``. When the
	    target name is taken, ``_1``, ``_2``, ... is inserted before the
	    extension. The moved file's access and modification times are set to the
	    message date. Progress and per-file problems are reported with ``print``;
	    a failure on one file does not abort the run.

	    Args:
	        source_folder: Directory that is walked recursively for ``.eml`` files.
	        destination_folder: Root of the output tree; sub-folders are created
	            on demand.
	    """
	    extension = '.eml'
	    # Common per-name limit on mainstream filesystems; the extension and any
	    # collision suffix must fit inside it together with the subject.
	    max_name_length = 255
	    destination_root = os.path.abspath(destination_folder)

	    for root, dirs, files in os.walk(source_folder):
	        # Prune the output tree from the walk so files that were just moved
	        # (or moved by an earlier run) are not picked up and renamed again.
	        dirs[:] = [
	            d for d in dirs
	            if os.path.abspath(os.path.join(root, d)) != destination_root
	        ]

	        for filename in files:
	            if not filename.endswith(extension):
	                continue

	            file_path = os.path.join(root, filename)
	            try:
	                with open(file_path, 'rb') as file:
	                    msg = email.message_from_binary_file(file)

	                date_str = msg.get('Date')
	                if not date_str:
	                    print(f"No Date header found in {filename}")
	                    continue

	                subject = msg.get('Subject', 'No Subject')
	                date = parsedate_to_datetime(date_str)
	                year = str(date.year)
	                month = f"{date.month:02d}"
	                day = f"{date.day:02d}"

	                new_folder = os.path.join(destination_folder, year, month, day)
	                os.makedirs(new_folder, exist_ok=True)

	                # Sanitize the subject and leave room for the extension
	                stem = sanitize_filename(subject)
	                safe_filename = stem[:max_name_length - len(extension)] + extension
	                new_file_path = os.path.join(new_folder, safe_filename)

	                # Pick the first free "<stem>_<n>.eml" if the name is taken
	                counter = 1
	                while os.path.exists(new_file_path):
	                    suffix = f"_{counter}{extension}"
	                    candidate = stem[:max_name_length - len(suffix)] + suffix
	                    new_file_path = os.path.join(new_folder, candidate)
	                    counter += 1

	                # Move the file
	                shutil.move(file_path, new_file_path)

	                # os.utime sets (access time, modification time)
	                timestamp = date.timestamp()
	                os.utime(new_file_path, (timestamp, timestamp))

	                print(f"Moved {filename} to {new_file_path}")

	                # Log any filename changes
	                if safe_filename != filename:
	                    print(f" Original filename: {filename}")
	                    print(f" Sanitized filename: {safe_filename}")

	            except Exception as e:
	                # Per-file boundary: report and continue with the next file
	                print(f"Error processing {filename}: {str(e)}")

	# Usage: replace the two placeholder paths, then run this file as a script.
	source_folder = '/path/to/input'
	destination_folder = '/path/to/output'

	# Only move files when executed directly, so importing this module for its
	# helper functions does not touch the filesystem.
	if __name__ == "__main__":
	    organize_eml_files(source_folder, destination_folder)