Skip to content

Instantly share code, notes, and snippets.

@urschrei
Last active March 25, 2024 02:20
Show Gist options
  • Star 83 You must be signed in to star a gist
  • Fork 29 You must be signed in to fork a gist
  • Save urschrei/5258588 to your computer and use it in GitHub Desktop.
Save urschrei/5258588 to your computer and use it in GitHub Desktop.
Extract attachments from EML files in the current dir, and write them to the output subdir
#!/usr/bin/env python
"""
2020 update:
- More iterators, fewer lists
- Python 3 compatible
- Processes files in parallel
(one worker process per CPU, via multiprocessing.Pool)
"""
import glob
import os
import email
from email import policy
from multiprocessing import Pool
# Extension (without the leading dot) of the message files that the
# __main__ section globs for in the current directory.
EXTENSION = "eml"
def extract(filename):
    """
    Extract the attachments of one EML file into the "output" directory.

    Args:
        filename: Path of the EML file to read.

    Returns:
        A ``(1, output_count)`` tuple. The constant 1 counts this file and
        ``output_count`` is the number of attachments written, so the
        per-file results can be summed column-wise by the caller.

    Read/write errors are reported on stdout instead of being raised, so
    one bad file does not abort a whole batch.
    """
    od = "output"
    # exist_ok avoids the check-then-create race between parallel workers
    os.makedirs(od, exist_ok=True)
    output_count = 0
    try:
        # Parse from bytes: an EML file is not guaranteed to be valid text
        # in the platform's default encoding.
        with open(filename, "rb") as f:
            msg = email.message_from_binary_file(f, policy=policy.default)
        for attachment in msg.iter_attachments():
            try:
                output_filename = attachment.get_filename()
            except AttributeError:
                print("Got string instead of filename for %s. Skipping." % filename)
                continue
            # Parts without a filename are not saved
            if not output_filename:
                continue
            # Keep only the final path component so a hostile name such as
            # "../../x" cannot escape the output directory.
            safe_name = os.path.basename(output_filename.replace("\\", "/"))
            if safe_name in ("", ".", ".."):
                continue
            # Decode before opening the target so that a failure does not
            # leave an empty file behind.
            payload = attachment.get_payload(decode=True)
            if payload is None:
                print("Couldn't get payload for %s" % output_filename)
                continue
            with open(os.path.join(od, safe_name), "wb") as of:
                of.write(payload)
            output_count += 1
        if output_count == 0:
            print("No attachment found for file %s!" % filename)
    # this should catch read and write errors
    except IOError:
        # Use the argument, not the file object: the handle does not exist
        # when open() itself is what failed.
        print("Problem with %s or one of its attachments!" % filename)
    return 1, output_count
if __name__ == "__main__":
    # Pool(None) starts one worker process per available CPU. map() blocks
    # until every file is done, so the pool can be released right after.
    with Pool(None) as pool:
        res = pool.map(extract, glob.iglob("*.%s" % EXTENSION))
    # Each result is (1, attachments_written). Summing the columns
    # separately also yields 0 and 0 when no file matched, instead of
    # failing on an empty result list.
    file_count = sum(r[0] for r in res)
    attachment_count = sum(r[1] for r in res)
    print(
        "Done: Processed {} files with {} attachments.".format(
            file_count, attachment_count
        )
    )
@HUrquhart
Copy link

Thank you so much! This helped me process all those real-estate emails and get the rent receipts. Thank you!

@maldunate
Copy link

It fails when there are files with the same name and extension. To work around this, I added the date of the email to the filename, e.g. "15 Apr 2021".

if output_filename:
with open(os.path.join(od, str(msg['date'])[5:16] + " " + output_filename), "wb") as of:
try:
of.write(attachment.get_payload(decode=True))
output_count += 1
except TypeError:
print("Couldn't get payload for %s" % output_filename)

@dharley-gaggle
Copy link

EXTREMELY useful, much thanks!

@jonathanyaod3
Copy link

@urschrei
Many thanks.
But I find that this code cannot extract .msg files, which come from the Outlook app. When I attach one .eml and one .msg file, only the .eml file is saved.

@kucster
Copy link

kucster commented Apr 24, 2023

Worked great! Thank you for your hard work

@dagelf
Copy link

dagelf commented Oct 3, 2023

This didn't work for me for an EML embedded inside another EML with PDFs in it; the script below did:
(Note that it only looks for PDF files.)

#!/usr/bin/python3
import os
import sys
import email
from email import policy
from email.parser import BytesParser
from email.iterators import typed_subpart_iterator

def extract_attachments(email_message, output_folder):
    """Write every named application/pdf part of *email_message* to disk.

    Args:
        email_message: Parsed message whose subparts are searched.
        output_folder: Existing directory that receives the PDF files.

    Parts without a filename, or whose payload cannot be decoded, are
    skipped.
    """
    for part in typed_subpart_iterator(email_message, 'application', 'pdf'):
        filename = part.get_filename()
        if not filename:
            continue
        # Keep only the last path component so that a name like
        # "../../x.pdf" cannot be written outside output_folder.
        safe_name = os.path.basename(filename.replace('\\', '/'))
        if safe_name in ('', '.', '..'):
            continue
        payload = part.get_payload(decode=True)
        # decode=True yields None when the part has no decodable body;
        # writing None would raise TypeError.
        if payload is None:
            continue
        filepath = os.path.join(output_folder, safe_name)
        with open(filepath, 'wb') as f:
            f.write(payload)

def parse_email(file_path, output_folder):
    """Parse the EML file at *file_path* and save the PDFs it contains.

    Args:
        file_path: Path of the .eml file to read (opened in binary mode).
        output_folder: Directory handed to extract_attachments for output.
    """
    with open(file_path, 'rb') as f:
        msg = BytesParser(policy=policy.default).parse(f)
    # extract_attachments iterates over all subparts of the message it is
    # given, which includes parts inside nested multipart containers and
    # embedded message/rfc822 messages. A single call on the root therefore
    # covers every case, and avoids rewriting the same files once per
    # top-level PDF part.
    extract_attachments(msg, output_folder)

if __name__ == "__main__":
    # Both positional arguments are required; exit with a usage line
    # rather than an IndexError traceback when one is missing.
    if len(sys.argv) < 3:
        sys.exit("usage: %s <file.eml> <output_folder>" % sys.argv[0])
    file_path, output_folder = sys.argv[1:3]
    os.makedirs(output_folder, exist_ok=True)
    parse_email(file_path, output_folder)

First argument is the .eml and second is where you want the files extracted.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment