Skip to content

Instantly share code, notes, and snippets.

@itsmejoeeey
Last active December 17, 2023 18:40
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save itsmejoeeey/ad9342204f8aa98a627af1dfd06794c3 to your computer and use it in GitHub Desktop.
Save itsmejoeeey/ad9342204f8aa98a627af1dfd06794c3 to your computer and use it in GitHub Desktop.
Python Gmail Organizer — gives a breakdown of the most common senders in your Gmail inbox. See the full writeup: https://joeeey.com/blog/declutter-gmail-inbox-with-python-guide/
#!/usr/bin/env python3
from __future__ import print_function
import os.path
import pandas as pd
import re
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
# If modifying these scopes, delete the file token.json.
SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']


def get_creds():
    """Return credentials for the scopes in ``SCOPES``.

    Credentials cached in ``token.json`` are reused while they are valid.
    Otherwise they are refreshed (when expired and a refresh token exists)
    or obtained through the local-server OAuth flow configured by
    ``credentials.json``; the result is then written back to ``token.json``.
    """
    token_path = 'token.json'

    # token.json holds the access/refresh tokens from a previous run, if any.
    creds = (
        Credentials.from_authorized_user_file(token_path, SCOPES)
        if os.path.exists(token_path)
        else None
    )
    if creds and creds.valid:
        return creds

    # No usable credentials: refresh silently when possible, else log in.
    if creds and creds.expired and creds.refresh_token:
        creds.refresh(Request())
    else:
        creds = InstalledAppFlow.from_client_secrets_file(
            'credentials.json', SCOPES
        ).run_local_server(port=0)

    # Persist the credentials for the next run.
    with open(token_path, 'w') as token_file:
        token_file.write(creds.to_json())
    return creds
# One {'message_id', 'username', 'domain'} record per successfully parsed message.
email_metadata = []

# Splits a From header value ("Name <user@host>" or a bare "user@host") into
# the text before the "@" and the text after it.
_FROM_ADDRESS_RE = re.compile(r'(?:.*<)?(.*)@(.*?)(?:>.*|$)')


def process_email_metadata(request_id, response, exception):
    """Batch callback: record the sender of one message in ``email_metadata``.

    Args:
        request_id: Identifier of the request within the batch; only used in
            warning messages.
        response: The message resource returned for the request, or None when
            the request failed.
        exception: The error raised for this request, or None on success.

    A request that failed, a message without a From header, or a From value
    that contains no address is skipped (with a warning on stderr for the
    failure and unparsable cases) instead of aborting the whole batch.
    """
    # Function-scope import so this block does not depend on the file's import list.
    import sys

    if exception is not None or response is None:
        print("Skipping request {}: {}".format(request_id, exception),
              file=sys.stderr)
        return

    message_id = response.get('id')
    headers = (response.get('payload') or {}).get('headers') or []
    for header in headers:
        # Header field names are case-insensitive, so accept "from"/"FROM" too.
        if header.get('name', '').lower() != 'from':
            continue
        match = _FROM_ADDRESS_RE.match(header.get('value', ''))
        if match is None:
            print("Skipping message {}: cannot parse sender {!r}".format(
                message_id, header.get('value')), file=sys.stderr)
        else:
            username, domain = match.groups()
            email_metadata.append({
                'message_id': message_id,
                'username': username,
                'domain': domain,
            })
        # Only the first From header is considered.
        break
def get_inbox_emails(service):
    """Fetch metadata for every INBOX message via batched requests.

    All message ids carrying the INBOX label are listed page by page, then
    each message is requested with ``format="metadata"`` in batches; every
    batch response is handed to ``process_email_metadata``.

    Args:
        service: Gmail API service object exposing ``users().messages()`` and
            ``new_batch_http_request`` (main() builds it with
            ``build('gmail', 'v1', ...)``).
    """
    # The API documents 500 as the largest page size it accepts.
    page_size = 500

    # Retrieve all message ids, following nextPageToken until exhausted.
    messages = []
    page_token = None
    while True:
        response = service.users().messages().list(
            userId='me',
            labelIds=['INBOX'],
            maxResults=page_size,
            pageToken=page_token,
        ).execute()
        # 'messages' is omitted when a page has no results (e.g. empty inbox).
        messages.extend(response.get('messages', []))
        page_token = response.get('nextPageToken')
        if not page_token:
            break

    # Retrieve the metadata for all messages, 100 requests per batch.
    batch_size = 100
    for start in range(0, len(messages), batch_size):
        batch_req = service.new_batch_http_request(callback=process_email_metadata)
        for message in messages[start:start + batch_size]:
            batch_req.add(service.users().messages().get(
                userId='me',
                id=message['id'],
                format="metadata",
            ))
        batch_req.execute()
def _print_counts(df, column, title):
    """Print ``title`` followed by the values of ``column`` ranked by frequency."""
    print(title)
    print(df.groupby(column)
            .size().reset_index(name='count')
            .sort_values(by='count', ascending=False)
            .to_string(index=False))


def main():
    """Authenticate, collect inbox sender metadata and print sender rankings.

    Prints the most common sender usernames and sender domains found in
    ``email_metadata``. When no sender was collected, a short notice is
    printed instead, because grouping an empty DataFrame by a missing column
    raises KeyError.
    """
    creds = get_creds()
    service = build('gmail', 'v1', credentials=creds)
    get_inbox_emails(service)

    if not email_metadata:
        print("No inbox emails with a parsable sender were found.")
        return

    # Print the results
    df = pd.DataFrame(email_metadata)
    _print_counts(df, 'username', "Most common email usernames -----------")
    print()
    _print_counts(df, 'domain', "Most common email domains -------------")


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment