Skip to content

Instantly share code, notes, and snippets.

@salbahra
Created July 30, 2024 18:04
Show Gist options
  • Save salbahra/577e6ada35d715ea70020e0e3830334e to your computer and use it in GitHub Desktop.
Save salbahra/577e6ada35d715ea70020e0e3830334e to your computer and use it in GitHub Desktop.
Takes a folder of EML files (searched recursively) and organizes them into a year/month/day folder structure, renaming each file to a filesystem-safe name derived from its subject.
import os
import email
from email.utils import parsedate_to_datetime
import shutil
import re
import unicodedata
from email.header import decode_header
import base64
import quopri
def decode_subject(subject):
    """Return a header value with its RFC 2047 encoded words decoded to text.

    Args:
        subject: The raw ``Subject`` header value, either a ``str`` or an
            ``email.header.Header`` instance.  ``None`` and ``''`` yield ``''``.

    Returns:
        The decoded subject as a single ``str``.  A header whose encoded words
        cannot be parsed at all is returned unchanged (best effort).
    """
    from email.errors import HeaderParseError

    if not subject:
        return ''

    try:
        chunks = decode_header(subject)
    except HeaderParseError:
        # Raised for encoded words with broken base64; keep the raw text
        # rather than losing the subject altogether.
        return str(subject)

    decoded_parts = []
    for part, charset in chunks:
        if isinstance(part, str):
            # Headers without any encoded word come back as plain str.
            decoded_parts.append(part)
            continue
        # decode_header has already undone the base64/quoted-printable
        # transfer encoding; only the charset decoding is left.  Chunks
        # without a charset are the unencoded (ASCII) stretches in between.
        try:
            decoded_parts.append(part.decode(charset or 'ascii'))
        except (LookupError, UnicodeDecodeError):
            # Unknown charset label (e.g. 'unknown-8bit') or bytes that do
            # not match the declared charset: fall back to UTF-8 and mark
            # undecodable bytes instead of dropping the whole chunk.
            decoded_parts.append(part.decode('utf-8', errors='replace'))
    return ''.join(decoded_parts)
def sanitize_filename(filename):
    """Convert an e-mail subject into a portable, ASCII-only file name stem.

    The value is first passed through ``decode_subject``, then reduced to
    ASCII, has its spaces replaced by underscores, and is stripped of every
    character that is not a word character, hyphen, underscore or dot.

    Args:
        filename: The raw subject header value to base the file name on.

    Returns:
        A string of at most 255 characters without extension.  When nothing
        usable remains (for example a subject written entirely in a
        non-Latin script), ``'No_Subject'`` is returned so that callers never
        end up with an empty stem.
    """
    filename = decode_subject(filename)
    # NFKD splits accented letters into base letter + combining mark, so the
    # ASCII round-trip keeps the base letter and discards only the mark.
    filename = unicodedata.normalize('NFKD', filename).encode('ASCII', 'ignore').decode()
    # Replace spaces
    filename = filename.strip().replace(' ', '_')
    # Remove any other potentially problematic characters (path separators,
    # control characters, quotes, ...)
    filename = re.sub(r'[^\w\-_\. ]', '', filename)
    # Truncate to a reasonable length (e.g., 255 characters)
    filename = filename[:255]
    # An empty stem would turn into a hidden, meaningless ".eml" file.
    return filename or 'No_Subject'
def organize_eml_files(source_folder, destination_folder):
    """Move the ``.eml`` files under *source_folder* into a dated folder tree.

    Every file whose name ends in ``.eml`` (case-insensitive) is parsed for
    its ``Date`` and ``Subject`` headers and moved to
    ``destination_folder/YYYY/MM/DD/<sanitized subject>.eml``.  Name clashes
    are resolved by appending ``_1``, ``_2``, ... to the stem, and the moved
    file's access/modification times are set to the message date.

    Files without a ``Date`` header are reported and left in place.  Any
    error while handling a single file is reported and the run continues
    with the next file.

    Args:
        source_folder: Folder that is searched recursively for ``.eml`` files.
        destination_folder: Root folder of the year/month/day tree to create.
    """
    extension = '.eml'
    # Common filesystems limit a single path component to 255 bytes; the
    # sanitized stem is ASCII, so characters and bytes coincide.
    max_name_length = 255
    destination_root = os.path.abspath(destination_folder)

    for root, dirs, files in os.walk(source_folder):
        # Do not descend into the destination when it lives inside the
        # source, otherwise files that were just moved get processed again.
        dirs[:] = [
            d for d in dirs
            if os.path.abspath(os.path.join(root, d)) != destination_root
        ]
        for filename in files:
            if not filename.lower().endswith(extension):
                continue
            file_path = os.path.join(root, filename)
            # Per-file boundary: one unreadable or malformed message must not
            # abort the whole batch, so the failure is reported and skipped.
            try:
                # Close the file before moving it (required on Windows).
                with open(file_path, 'rb') as file:
                    msg = email.message_from_binary_file(file)
                date_str = msg.get('Date')
                subject = msg.get('Subject', 'No Subject')
                if not date_str:
                    print(f"No Date header found in {filename}")
                    continue

                date = parsedate_to_datetime(date_str)
                year = str(date.year)
                month = f"{date.month:02d}"
                day = f"{date.day:02d}"
                new_folder = os.path.join(destination_folder, year, month, day)
                os.makedirs(new_folder, exist_ok=True)

                # Sanitize the filename, leaving room for the extension
                stem = sanitize_filename(subject)[:max_name_length - len(extension)]
                safe_filename = stem + extension
                new_file_path = os.path.join(new_folder, safe_filename)

                # Check if a file with this name already exists
                counter = 1
                while os.path.exists(new_file_path):
                    suffix = f"_{counter}"
                    # Trim the stem so the counter never pushes the name
                    # past the filesystem limit.
                    trimmed = stem[:max_name_length - len(extension) - len(suffix)]
                    new_file_path = os.path.join(new_folder, f"{trimmed}{suffix}{extension}")
                    counter += 1

                # Move the file
                shutil.move(file_path, new_file_path)
                # Update file access and modification times
                timestamp = date.timestamp()
                os.utime(new_file_path, (timestamp, timestamp))
                print(f"Moved {filename} to {new_file_path}")
                # Log any filename changes
                if safe_filename != filename:
                    print(f" Original filename: {filename}")
                    print(f" Sanitized filename: {safe_filename}")
            except Exception as e:
                print(f"Error processing {filename}: {str(e)}")
# Usage: set these to the folder containing the .eml files and to the folder
# that should receive the year/month/day tree, then run this file as a script.
source_folder = '/path/to/input'
destination_folder = '/path/to/output'

if __name__ == '__main__':
    # Guarded so that importing this module (e.g. to reuse the helpers) does
    # not start moving files as a side effect.
    organize_eml_files(source_folder, destination_folder)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment