Created
July 30, 2024 18:04
-
-
Save salbahra/577e6ada35d715ea70020e0e3830334e to your computer and use it in GitHub Desktop.
Takes a flat list of EML files and organizes them into a year/month/day folder structure, renaming each file to a filesystem-safe name derived from its subject.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import email | |
from email.utils import parsedate_to_datetime | |
import shutil | |
import re | |
import unicodedata | |
from email.header import decode_header | |
import base64 | |
import quopri | |
def decode_subject(subject):
    """Decode an RFC 2047 encoded header value into a plain string.

    ``decode_header`` already reverses the base64 / quoted-printable
    transfer encoding of every encoded word, so each bytes chunk only has
    to be decoded with the charset that was declared for it.

    Args:
        subject: Raw header value, possibly containing encoded words such
            as ``=?iso-8859-1?q?Caf=E9?=``.

    Returns:
        The decoded text of all chunks, concatenated in order.
    """
    decoded_parts = []
    for part, charset in decode_header(subject):
        if not isinstance(part, bytes):
            # A header without encoded words is handed back as str.
            decoded_parts.append(part)
            continue
        try:
            # Chunks without a declared charset are the unencoded stretches
            # of the header; they must NOT be run through base64/quopri
            # again, or ordinary text can be mangled.
            decoded_parts.append(part.decode(charset or 'utf-8'))
        except (LookupError, UnicodeDecodeError):
            # Unknown or wrongly declared charset: salvage what is valid
            # UTF-8 instead of dropping the chunk or failing the file.
            decoded_parts.append(part.decode('utf-8', errors='ignore'))
    return ''.join(decoded_parts)
def sanitize_filename(filename):
    """Turn a (possibly encoded) header value into an ASCII-only file stem.

    The value is decoded with ``decode_subject``, reduced to ASCII, has its
    spaces turned into underscores, is stripped of characters outside the
    allowed set, and is finally cut to at most 255 characters.

    Args:
        filename: Raw text to convert, e.g. a message subject.

    Returns:
        The sanitized name (without any extension); may be empty.
    """
    text = decode_subject(filename)

    # NFKD splits accented letters into base letter + combining mark, so
    # dropping the non-ASCII bytes keeps the base letter.
    ascii_bytes = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore')
    text = ascii_bytes.decode()

    # Trim the ends first, then make the remaining spaces explicit.
    underscored = text.strip().replace(' ', '_')

    # Keep only word characters, dash, underscore, dot and space.
    cleaned = re.sub(r'[^\w\-_\. ]', '', underscored)

    # Cap the length at 255 characters.
    return cleaned[:255]
def organize_eml_files(source_folder, destination_folder):
    """Move EML files into ``<destination>/<year>/<month>/<day>/`` folders.

    Every ``.eml`` file found under ``source_folder`` (extension matched
    case-insensitively) is parsed, its ``Date`` header selects the target
    folder and its ``Subject`` header provides the new file name. Name
    clashes are resolved by appending ``_1``, ``_2``, ... to the stem. After
    the move, the file's access and modification times are set to the
    message date. Progress and per-file errors are printed; a failure on
    one file does not stop the run.

    Args:
        source_folder: Directory tree to scan for EML files.
        destination_folder: Root directory of the year/month/day structure.
    """
    extension = '.eml'
    # Most filesystems reject single path components longer than 255
    # characters, so the stem is trimmed to leave room for the extension
    # (and for the collision suffix when one is needed).
    max_name_length = 255
    stem_limit = max_name_length - len(extension)

    for root, _, files in os.walk(source_folder):
        for filename in files:
            if not filename.lower().endswith(extension):
                continue

            file_path = os.path.join(root, filename)
            try:
                # Parse and close the file before moving it; an open handle
                # blocks the move on some platforms.
                with open(file_path, 'rb') as file:
                    msg = email.message_from_binary_file(file)

                date_str = msg.get('Date')
                if not date_str:
                    print(f"No Date header found in {filename}")
                    continue

                date = parsedate_to_datetime(date_str)
                new_folder = os.path.join(
                    destination_folder,
                    str(date.year),
                    f"{date.month:02d}",
                    f"{date.day:02d}",
                )
                os.makedirs(new_folder, exist_ok=True)

                # An empty or fully stripped subject would otherwise yield
                # the hidden file name '.eml'.
                subject = msg.get('Subject', 'No Subject')
                stem = sanitize_filename(subject) or 'No_Subject'
                safe_filename = stem[:stem_limit] + extension
                new_file_path = os.path.join(new_folder, safe_filename)

                # Pick the first free "<stem>_<n>.eml" if the name is taken.
                counter = 1
                while os.path.exists(new_file_path):
                    suffix = f"_{counter}"
                    candidate = stem[:stem_limit - len(suffix)] + suffix + extension
                    new_file_path = os.path.join(new_folder, candidate)
                    counter += 1

                shutil.move(file_path, new_file_path)

                # Set access and modification times to the message date.
                timestamp = date.timestamp()
                os.utime(new_file_path, (timestamp, timestamp))

                print(f"Moved {filename} to {new_file_path}")
                # Log any filename changes
                if safe_filename != filename:
                    print(f" Original filename: {filename}")
                    print(f" Sanitized filename: {safe_filename}")
            except Exception as e:
                # Best effort: report the problem and carry on with the
                # remaining files.
                print(f"Error processing {filename}: {str(e)}")
# Usage: set the two paths below, then run this file as a script.
source_folder = '/path/to/input'
destination_folder = '/path/to/output'
organize_eml_files(
    source_folder=source_folder,
    destination_folder=destination_folder,
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment