Skip to content

Instantly share code, notes, and snippets.

@abought
Last active January 31, 2023 15:22
Show Gist options
  • Save abought/15a1e08705b121c1b7bd to your computer and use it in GitHub Desktop.
Save abought/15a1e08705b121c1b7bd to your computer and use it in GitHub Desktop.
Extract all email addresses in from/to/cc fields of every msg in one Gmail folder
"""Create a connection to Gmail and do something with the results
References:
http://www.voidynullness.net/blog/2013/07/25/gmail-email-with-python-via-imap/
and
https://yuji.wordpress.com/2011/06/22/python-imaplib-imap-example-with-gmail/
"""
__author__ = 'abought'
import email
import imaplib
import getpass
import sys
import re
from pprint import pprint as pp
# User may want to change these parameters if running script as-is
# Mailbox (label) to scan. Gmail's system folders are addressed with a
# "[Gmail]/" prefix; without the slash the folder cannot be selected.
SEARCH_FOLDER = '[Gmail]/Trash'  # TODO: A user will want to change this
# Other folders: "INBOX", "[Gmail]/All Mail"
DEFAULT_MAIL_SERVER = 'imap.gmail.com'
# No user parameters below this line
# Non-greedy so each <...> pair in a multi-address header is matched separately.
ADDR_PATTERN = re.compile(r'<(.*?)>')  # Finds email as <nospam@nospam.com>
def connect(user, pwd, server=DEFAULT_MAIL_SERVER):
    """Open an SSL IMAP session on `server` and log in.

    Returns the authenticated connection. When the server rejects the
    credentials, "Failed to login" is printed and the process exits with
    status 1.
    """
    session = imaplib.IMAP4_SSL(server)
    try:
        session.login(user, pwd)
    except imaplib.IMAP4.error:
        print("Failed to login")
        sys.exit(1)
    else:
        return session
def print_folders(conn):
    """Print the mailbox folders (labels) the server reports, one per line.

    `conn.list()` returns a ``(status, entries)`` pair, so the pair is
    unpacked and only the entries are printed. When the status is not 'OK'
    a short notice is printed instead.
    """
    rv, folders = conn.list()
    if rv != 'OK':
        print("Could not list folders")
        return
    for f in folders:
        # imaplib uses None as a placeholder when there is nothing to report
        if f is None:
            continue
        # Tab, space, entry: same layout as the former `print "\t", f`
        print("\t %s" % (f,))
def get_folder(conn, folder_name):
    """Select `folder_name` (a folder or label) on the connection.

    A folder that is already selected is closed first. If the server
    refuses the select, a notice and the list of known folders are
    printed. The connection is returned in either case.
    """
    folder_already_open = conn.state == "SELECTED"
    if folder_already_open:
        # Explicitly close any previously opened folders; may not be necessary
        conn.close()
    status, _unused = conn.select(folder_name)
    if status == 'OK':
        return conn
    print("Could not open specified folder. Known labels:")
    print_folders(conn)
    return conn
def get_email_ids(conn, query='ALL'):
    """Return the UIDs of all messages in the selected folder matching `query`.

    The search is issued through ``conn.uid('search', ...)``, so the values
    returned are UIDs as sent by the server, split on whitespace.

    Returns an empty list when the search is not answered with 'OK' or when
    the server sends no result data.

    Raises imaplib.IMAP4.error if no folder is currently selected.
    """
    if conn.state != "SELECTED":
        raise imaplib.IMAP4.error("Cannot search without selecting a folder")
    rv, data = conn.uid('search', None, query)
    if rv != 'OK':
        print("Could not fetch email ids")  # for some reason...
        return []
    # imaplib hands back [None] when the server sent no untagged SEARCH
    # response; treat that the same as "no matches" instead of crashing.
    if not data or data[0] is None:
        return []
    return data[0].split()
def fetch_message(conn, msg_uid):
    """
    Fetch a specific message uid (not sequential id!) from the currently
    selected folder and return the parsed message (a dict-like
    email.message.Message).

    Returns an empty dict when the fetch is not answered with 'OK' or when
    the response carries no message payload (e.g. the UID is not present in
    the folder).
    """
    # TODO: Could we fetch just the envelope of the response to save bandwidth?
    rv, data = conn.uid('fetch', msg_uid, "(RFC822)")
    # A successful fetch yields a (description, payload) tuple as the first
    # item; anything else (such as None) means there is no message to parse.
    if rv != 'OK' or not data or not isinstance(data[0], tuple):
        print("ERROR fetching message # %s" % (msg_uid,))
        return {}
    raw = data[0][1]
    if isinstance(raw, str):
        return email.message_from_string(raw)  # dict-like object
    # Payload arrived as bytes rather than str; parse it without decoding.
    return email.message_from_bytes(raw)
def get_recipients(msg_parsed):
    """Given a parsed message, extract and return the list of addresses.

    Looks at the From, To, Cc and Bcc headers (every occurrence of each)
    and returns the bare addresses in header order. Addresses are parsed
    with email.utils.getaddresses, so both ``Name <user@host>`` and plain
    ``user@host`` forms are picked up. Duplicates are kept.

    `msg_parsed` may be an email.message.Message or a plain dict of header
    values (fetch_message returns ``{}`` on failure); a missing header
    contributes nothing.
    """
    # Imported here so the block does not rely on `import email` having
    # loaded the utils submodule.
    from email.utils import getaddresses

    recipients = []
    for field in ('From', 'To', 'Cc', 'Bcc'):
        if hasattr(msg_parsed, 'get_all'):
            values = msg_parsed.get_all(field) or []
        else:
            # Plain dict: at most one value per header name
            value = msg_parsed.get(field, "")
            values = [value] if value else []
        for _display_name, address in getaddresses(values):
            if address:  # unparsable entries come back with an empty address
                recipients.append(address)
    return recipients
if __name__ == "__main__":
    # Script entry point: prompt for credentials, scan SEARCH_FOLDER and
    # print every address found, then the de-duplicated set.
    username = raw_input("Full email address: ")
    password = getpass.getpass()
    # Connect
    mail_conn = connect(username, password)
    all_recipients = []
    folder_opened = False
    try:
        # Open a specific folder and get list of email message uids
        mail_conn = get_folder(mail_conn, SEARCH_FOLDER)
        # get_folder() reports a failed select itself; searching without a
        # selected folder would raise, so only continue when it succeeded.
        folder_opened = mail_conn.state == "SELECTED"
        if folder_opened:
            # Fetch a list of recipients
            for msg_uid in get_email_ids(mail_conn):
                msg = fetch_message(mail_conn, msg_uid)
                all_recipients.extend(get_recipients(msg))
    finally:
        # Always end the session, even when something above failed
        try:
            if mail_conn.state == "SELECTED":
                mail_conn.close()  # Close currently selected folder (if any)
        finally:
            mail_conn.logout()
    if not folder_opened:
        sys.exit(1)
    # Very unsophisticated way of showing the recipient list
    print("List of all recipients:")
    print("------------")
    pp(all_recipients)
    print("\n\n List of all UNIQUE recipients:")
    print("-------------------------------")
    pp(set(all_recipients))
@abought
Copy link
Author

abought commented Mar 24, 2015

Fetches all email messages from one Gmail folder and returns a list of email addresses in the FROM, TO, CC, and BCC fields. Written for Python 2.7; should not require any external dependencies.

Usage

Can be run directly as is (once you specify the desired SEARCH_FOLDER). Will prompt for username and password.

python gistfile1.py

By default it just outputs results to the console. The variables SEARCH_FOLDER and DEFAULT_MAIL_SERVER at the top of the file can be changed as appropriate; the default value (Trash) was chosen for testing purposes (my trash folder had just a few messages). Other common sample folder/label names used by Gmail are provided. This file also exposes functions that can be called from other scripts.

Caveats

By default this is a bit brute-force and has no safeguards for bandwidth usage etc. It was tuned to the responses returned by Gmail, and may or may not work perfectly with other IMAP providers.

@mutlusun
Copy link

Hello @abought,
Thanks for this little script! I updated it to use it with python3:

"""Create a connection to Gmail and do something with the results

References:
http://www.voidynullness.net/blog/2013/07/25/gmail-email-with-python-via-imap/
and
https://yuji.wordpress.com/2011/06/22/python-imaplib-imap-example-with-gmail/
"""
__author__ = 'abought'

import email.parser
import imaplib
import getpass
import sys
import re
import ssl

from pprint import pprint as pp

# User may want to change these parameters if running script as-is

# Search folders, multiple directories can be given
# TODO: A user will want to change this
# Gmail's system folders need the "[Gmail]/" prefix (with the slash).
# NOTE(review): names appear to be double-quoted because some contain
# spaces and are passed to the server as given — confirm.
SEARCH_FOLDER = ['"[Gmail]/Trash"', '"[Gmail]/All Mail"', '"INBOX"']
DEFAULT_MAIL_SERVER = 'imap.gmail.com'

# No user parameters below this line
# Non-greedy: a greedy ".+" would swallow everything between the first "<"
# and the last ">" of a header listing several addresses.
ADDR_PATTERN = re.compile(r"<(.+?)>")  # Finds email as <nospam@nospam.com>


def connect(user, pwd, server=DEFAULT_MAIL_SERVER):
    """Open an SSL IMAP session on `server` and log in.

    The TLS layer uses ssl.create_default_context(). Returns the
    authenticated connection. When the server rejects the credentials,
    "Failed to login" is printed and the process exits with status 1.
    """
    tls_context = ssl.create_default_context()
    session = imaplib.IMAP4_SSL(host=server, ssl_context=tls_context)
    try:
        session.login(user, pwd)
    except imaplib.IMAP4.error:
        print("Failed to login")
        sys.exit(1)
    else:
        return session


def print_folders(conn):
    """Print the mailbox folders (labels) the server reports, one per line.

    `conn.list()` returns a ``(status, entries)`` pair, so the pair is
    unpacked and only the entries are printed (iterating the pair itself
    would also walk the characters of the status string). Byte entries are
    decoded for readable output. When the status is not 'OK' a short notice
    is printed instead.
    """
    typ, folders = conn.list()
    if typ != 'OK':
        print("Could not list folders")
        return

    for entry in folders:
        # imaplib uses None as a placeholder when there is nothing to report
        if entry is None:
            continue
        if isinstance(entry, bytes):
            entry = entry.decode('utf-8', 'replace')
        print("\t", entry)


def get_mails_from_folder(conn, folder_name):
    """Select `folder_name` read-only and list the messages it contains.

    Returns the message sequence numbers (as sent by the server, split on
    whitespace) of every message in the folder. These come from a plain
    SEARCH, so they are sequence numbers, not UIDs.

    Returns an empty list when the folder cannot be selected, the search
    fails, or the server sends no result data, so the result can always be
    iterated. A notice is printed in the two failure cases.
    """
    typ, data = conn.select(mailbox=folder_name, readonly=True)
    if typ != 'OK':
        print("Could not open specified folder. Known labels:")
        print_folders(conn)
        return []

    typ, data = conn.search(None, 'ALL')
    if typ != 'OK':
        print("Could not get mail list of folder: ", folder_name)
        return []

    # imaplib hands back [None] when the server sent no untagged SEARCH
    # response; treat that as an empty folder.
    if not data or data[0] is None:
        return []

    return data[0].split()

def fetch_message(conn, msg_uid):
    """
    Fetch the headers of one message from the currently selected folder and
    return them as a parsed message object.

    `msg_uid` is passed to ``conn.fetch``, i.e. it is a message sequence
    number such as those returned by a plain SEARCH (despite the parameter
    name, not a UID).

    Returns None when the fetch is not answered with 'OK' or when the
    response carries no payload for that message.
    """
    # Only the header block is parsed below, so only the header block is
    # requested; PEEK avoids marking the message as seen.
    typ, data = conn.fetch(msg_uid, '(BODY.PEEK[HEADER])')
    # A successful fetch yields a (description, payload) tuple as the first
    # item; anything else (such as None) means there is nothing to parse.
    if typ != 'OK' or not data or not isinstance(data[0], tuple):
        print("ERROR fetching message #", msg_uid)
        return None

    return email.parser.BytesParser().parsebytes(data[0][1], headersonly=True)


def get_recipients(msg):
    """Given a parsed message, extract and return the list of addresses.

    Looks at the From, To, Cc, Bcc, Reply-To and Sender headers (every
    occurrence of each) and returns the bare addresses in header order.
    Addresses are parsed with email.utils.getaddresses, so both
    ``Name <user@host>`` and plain ``user@host`` forms are picked up and a
    header listing several addresses yields one entry per address.
    Duplicates are kept.

    Returns an empty list when `msg` is None (fetch_message returns None on
    failure).
    """
    # Imported here so the block does not depend on which email submodules
    # the file happens to import at the top.
    from email.utils import getaddresses

    if msg is None:
        return []

    recipients = []
    addr_fields = ['From', 'To', 'Cc', 'Bcc', 'Reply-To', 'Sender']

    for f in addr_fields:
        # str conversion is needed for non-ascii chars
        values = [str(v) for v in (msg.get_all(f) or [])]
        for _display_name, address in getaddresses(values):
            if address:  # unparsable entries come back with an empty address
                recipients.append(address)

    return recipients


if __name__ == "__main__":
    # Script entry point: prompt for credentials, scan every folder in
    # SEARCH_FOLDER and print the addresses found, then the unique set.
    username = input("Enter username: ")
    # getpass keeps the password from being echoed to the terminal
    password = getpass.getpass("Enter password: ")

    # Connect
    mail_conn = connect(username, password)

    # To show the folders of the mail account, call print_folders(mail_conn)

    # Open folders and collect the addresses of every message in them
    all_recipients = []
    try:
        for folder in SEARCH_FOLDER:
            # switch to folder; a folder that cannot be opened yields nothing
            mail_ids = get_mails_from_folder(mail_conn, folder) or []
            for mail_id in mail_ids:
                data = fetch_message(mail_conn, mail_id)
                if data is None:
                    # fetch failed and was already reported; skip the message
                    continue
                all_recipients.extend(get_recipients(data))

            # CLOSE is only legal while a folder is selected
            if mail_conn.state == "SELECTED":
                mail_conn.close()
    finally:
        # Always end the session, even when something above failed
        mail_conn.logout()

    # Very unsophisticated way of showing the recipient list
    print("List of all recipients:")
    print("------------")
    pp(all_recipients)

    print("\n\n List of all UNIQUE recipients:")
    print("-------------------------------")
    pp(set(all_recipients))

Best

@madeinlisboa
Copy link

Thanks! That's what I was looking for. Is there a way of limiting by date?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment