Skip to content

Instantly share code, notes, and snippets.

@Droogy
Created August 24, 2022 05:01
Show Gist options
  • Save Droogy/d8490181415154a95f0a3916f94d4c85 to your computer and use it in GitHub Desktop.
Save Droogy/d8490181415154a95f0a3916f94d4c85 to your computer and use it in GitHub Desktop.
Extract .eml URLs and Attachments
# extract attachments and URLs (from e-mail body) from Outlook .eml files
#
# 1. dump all mail samples into folder called Mail/
# 2. create empty folder "Attachments/"
# 3. run tool from root of both folders
import os
import re
from independentsoft.msg import Message
msg_mail_dir = "Mail/"
attachments_dir = "Attachments/"
# bracketed links starting with this prefix are the re-written form and are
# skipped; their original target is recovered by rewritten_url_re instead
rewritten_url_prefix = "https://urldefense.com"

# Raw strings so that \S and \s reach the regex engine untouched instead of
# being treated as (invalid) string escapes.
rewritten_url_re = re.compile(r"__(htt.{1,2}:\S+)__;\S+>\s", flags=re.M)
bracketed_url_re = re.compile(r"<(htt.{1,2}:\S+)>", flags=re.M)


def list_mail_files(mail_dir=msg_mail_dir):
    """Return the names of the regular files directly inside *mail_dir*.

    Sub-directories and other non-file entries are ignored.
    """
    return [
        name
        for name in os.listdir(mail_dir)
        if os.path.isfile(os.path.join(mail_dir, name))
    ]


def read_bodies_and_save_attachments(
    file_names, mail_dir=msg_mail_dir, out_dir=attachments_dir
):
    """Load each mail file, save its attachments, and return all body text.

    Args:
        file_names: names of mail files located inside *mail_dir*.
        mail_dir: folder the mail files are read from.
        out_dir: folder the attachments are written to.

    Returns:
        The bodies of all messages, each followed by a newline, joined into
        one string in the order of *file_names*.
    """
    bodies = []
    for file_name in file_names:
        message = Message(os.path.join(mail_dir, file_name))
        bodies.append(f"{message.body}\n")
        for index, attachment in enumerate(message.attachments):
            # The attachment name comes from the e-mail itself; keep only its
            # final path component so a name like "../../x" cannot write
            # outside out_dir.
            safe_name = os.path.basename(str(attachment.file_name))
            if safe_name in ("", ".", ".."):
                safe_name = f"attachment_{index}"
            attachment.save(os.path.join(out_dir, safe_name))
    return "".join(bodies)


def extract_links(text):
    """Return the URLs found in *text*.

    Two passes are made: the first pulls the original URL out of re-written
    links (the part between ``__`` and ``__;``), the second collects every
    ``<http...>`` link and drops those that start with the re-writer prefix.
    Results of the first pass come first in the returned list.
    """
    links = rewritten_url_re.findall(text)
    links.extend(
        link
        for link in bracketed_url_re.findall(text)
        if not link.startswith(rewritten_url_prefix)
    )
    return links


def main():
    """Dump attachments, body text and extracted links for every mail file."""
    # harmless if the folder was already created by hand
    os.makedirs(attachments_dir, exist_ok=True)
    email_bodies = read_bodies_and_save_attachments(list_mail_files())

    # write e-mail body text to file (explicit encoding: bodies may hold
    # characters the platform default encoding cannot represent)
    with open("email_bodies.txt", "w", encoding="utf-8") as body_file:
        body_file.write(email_bodies)

    # dump extracted links to file, one per line
    with open("extracted_links.txt", "w", encoding="utf-8") as link_file:
        link_file.writelines(f"{link}\n" for link in extract_links(email_bodies))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment