Created
August 24, 2022 05:01
-
-
Save Droogy/d8490181415154a95f0a3916f94d4c85 to your computer and use it in GitHub Desktop.
Extract .eml URLs and Attachments
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Extract attachments and URLs (from the e-mail body) from Outlook .eml files.
#
# 1. Dump all mail samples into a folder called Mail/
# 2. Create an empty folder called Attachments/
# 3. Run the tool from the directory that contains both folders
import os
import re
from pathlib import Path

from independentsoft.msg import Message
# Folder that holds the mail samples to process.
msg_mail_dir = "Mail/"
# Folder that receives every attachment found in the samples.
attachments_dir = "Attachments/"

# Round 1: URLs that were re-written by a link-protection service. The
# original URL sits between "__" and "__;" and is followed by more
# non-space characters, a closing ">" and one whitespace character.
REWRITTEN_URL_RE = re.compile(r"__(htt.{1,2}:\S+)__;\S+>\s")
# Round 2: any http/https URL enclosed in angle brackets.
BRACKETED_URL_RE = re.compile(r"<(htt.{1,2}:\S+)>")
# Bracketed links starting with this prefix are the re-written form; their
# original target is already captured by round 1, so they are skipped.
REWRITTEN_PREFIX = "https://urldefense.com"


def safe_attachment_name(file_name, fallback="attachment"):
    """Return *file_name* reduced to a bare file name.

    Attachment names come from the e-mail itself and are therefore
    untrusted: directory components (with either ``/`` or ``\\`` as the
    separator) are dropped so a crafted name cannot escape the output
    folder. Empty, ``.`` and ``..`` names are replaced by *fallback*.
    """
    base = Path(str(file_name or "").replace("\\", "/")).name
    if base in ("", ".", ".."):
        return fallback
    return base


def list_mail_files(mail_dir):
    """Return the paths of all regular files directly inside *mail_dir*."""
    return [entry for entry in Path(mail_dir).iterdir() if entry.is_file()]


def collect_bodies_and_attachments(mail_paths, output_dir):
    """Parse each mail file, save its attachments, and return all bodies.

    Every attachment is written to *output_dir* under a sanitized name.
    The return value is the concatenation of every message body, each
    followed by a newline.
    """
    target_dir = Path(output_dir)
    # Harmless if the folder already exists; avoids a failure on first save.
    target_dir.mkdir(parents=True, exist_ok=True)

    bodies = []
    for mail_path in mail_paths:
        message = Message(str(mail_path))
        # NOTE(review): guard against a missing body so the literal text
        # "None" is never written - confirm whether the library can
        # actually return None here.
        bodies.append(f"{message.body or ''}\n")
        for attachment in message.attachments:
            safe_name = safe_attachment_name(attachment.file_name)
            attachment.save(str(target_dir / safe_name))
    return "".join(bodies)


def extract_links(text):
    """Return the links found in *text*.

    The result lists the original targets of re-written URLs first,
    followed by every other angle-bracketed URL. Bracketed URLs that are
    themselves the re-written form are left out.
    """
    links = REWRITTEN_URL_RE.findall(text)
    links.extend(
        link
        for link in BRACKETED_URL_RE.findall(text)
        if not link.startswith(REWRITTEN_PREFIX)
    )
    return links


def main():
    """Dump bodies, attachments and extracted links for every mail sample."""
    email_bodies = collect_bodies_and_attachments(
        list_mail_files(msg_mail_dir), attachments_dir
    )

    # Write the e-mail body text to file. UTF-8 is explicit so non-ASCII
    # bodies do not depend on the platform's default encoding.
    with open("email_bodies.txt", "w", encoding="utf-8") as body_file:
        body_file.write(email_bodies)

    # Dump the extracted links to file, one per line.
    with open("extracted_links.txt", "w", encoding="utf-8") as link_file:
        link_file.writelines(f"{link}\n" for link in extract_links(email_bodies))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment