Skip to content

Instantly share code, notes, and snippets.

@tbbooher
Created April 26, 2023 14:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tbbooher/aaad12a58395903158d6d95721a431f6 to your computer and use it in GitHub Desktop.
Save tbbooher/aaad12a58395903158d6d95721a431f6 to your computer and use it in GitHub Desktop.
from __future__ import print_function
import base64
import os
import os.path
import sys
import email
import datetime
import pytz
import base64
import email
from weasyprint import HTML
from email import policy
from email.parser import BytesParser
from bs4 import BeautifulSoup
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
import weasyprint
# OAuth scopes requested when authorising against Google. The single
# `gmail.readonly` scope is passed to the credential helpers used in main().
SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']
def _find_html_data(payload):
    """Return the encoded body data of the first text/html part, or None.

    Walks ``payload['parts']`` depth-first so that HTML nested inside
    multipart containers (parts that themselves carry ``parts``) is found.
    """
    for part in payload.get('parts', []):
        if part.get('mimeType') == 'text/html':
            data = part.get('body', {}).get('data')
            if data:
                return data
        nested = _find_html_data(part)
        if nested:
            return nested
    return None


def get_email_body(msg):
    """Extract and pretty-print the HTML body of a message dict.

    Args:
        msg: A message dict. If it has a ``payload`` key, the HTML is taken
            from the first ``text/html`` part (searched recursively) or, for
            a payload without ``parts``, from ``payload['body']['data']``.
            Otherwise the ``raw`` field (str or bytes) is used.

    Returns:
        The decoded body run through ``BeautifulSoup(...).prettify()``, or
        ``None`` when the message carries no body data, so callers can skip
        the message instead of crashing.
    """
    if 'payload' in msg:
        payload = msg['payload']
        if 'parts' in payload:
            body_data = _find_html_data(payload)
        else:
            body_data = payload.get('body', {}).get('data')
    else:
        body_data = msg.get('raw')

    if not body_data:
        return None

    if isinstance(body_data, str):
        body_data = body_data.encode('utf-8')
    # urlsafe_b64decode insists on padding; restore any '=' that was dropped.
    body_data += b'=' * (-len(body_data) % 4)
    # Not every email is UTF-8; replace undecodable bytes rather than abort.
    html = base64.urlsafe_b64decode(body_data).decode('utf-8', errors='replace')
    soup = BeautifulSoup(html, 'html.parser')
    return soup.prettify()
def get_subject(msg):
    """Return the value of the message's Subject header, or None.

    Headers are read from ``msg['payload']['headers']`` when present and
    non-empty, otherwise from ``msg['headers']`` -- the same tolerant lookup
    that ``get_received_time`` uses. A message without headers yields
    ``None`` instead of raising ``KeyError``.

    Args:
        msg: A message dict whose headers are a list of
            ``{'name': ..., 'value': ...}`` dicts.

    Returns:
        The subject string, or ``None`` if there is no Subject header.
    """
    headers = msg.get('payload', {}).get('headers', []) or msg.get('headers', [])
    for header in headers:
        # Header names are matched case-insensitively.
        if header.get('name', '').lower() == 'subject':
            return header.get('value')
    return None
def get_received_time(msg):
    """Return the message's timestamp as a timezone-aware UTC datetime.

    The ``Date`` header is preferred; it is parsed with
    ``email.utils.parsedate_to_datetime`` so that the usual RFC 2822
    variations (missing weekday, ``GMT``, trailing ``(UTC)``-style comments)
    are accepted. If there is no ``Date`` header, or it cannot be parsed,
    the ``internalDate`` field (milliseconds since the epoch) is used.

    Args:
        msg: A message dict with an ``internalDate`` field and headers under
            ``payload.headers`` or ``headers``.

    Returns:
        A ``datetime.datetime`` in UTC.

    Raises:
        ValueError: If the message has no ``internalDate`` field.
    """
    from email.utils import parsedate_to_datetime

    internal_date_str = msg.get('internalDate')
    if internal_date_str is None:
        raise ValueError("Message has no internalDate field")
    internal_date = datetime.datetime.fromtimestamp(
        int(internal_date_str) / 1000, tz=datetime.timezone.utc
    )

    headers = msg.get('payload', {}).get('headers', []) or msg.get('headers', [])
    for header in headers:
        if header['name'].lower() != 'date':
            continue
        try:
            date = parsedate_to_datetime(header['value'])
        except (TypeError, ValueError):
            # Unparseable input raises ValueError on newer Pythons and
            # TypeError on older ones; fall through to internalDate.
            continue
        if date.tzinfo is None:
            # A "-0000" offset yields a naive datetime whose fields are UTC.
            date = date.replace(tzinfo=datetime.timezone.utc)
        return date.astimezone(internal_date.tzinfo)

    # No usable Date header was found: use the internal date.
    return internal_date
def _load_credentials():
    """Load cached OAuth credentials, refreshing or re-authorising as needed."""
    creds = None
    if os.path.exists('token.json'):
        creds = Credentials.from_authorized_user_file('token.json', SCOPES)
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                'credentials.json', SCOPES)
            creds = flow.run_local_server(port=0)
        # Cache the new or refreshed token for the next run.
        with open('token.json', 'w') as token:
            token.write(creds.to_json())
    return creds


def _safe_filename(text):
    """Turn arbitrary text (e.g. an email subject) into a safe file-name stem."""
    cleaned = ''.join(
        ch if ch.isalnum() or ch in ' ._-' else '_' for ch in (text or '')
    )
    # Stripping leading/trailing dots and spaces avoids names like '..'.
    return cleaned.strip(' .') or 'no_subject'


def main():
    """Save recent "Uber Receipts" emails from Gmail as PDF files.

    Authorises against the Gmail API, lists up to 10 messages matching
    ``from:"Uber Receipts"``, and writes each message's HTML body to
    ``<subject>_<YYYYmmdd_HHMM>.pdf`` in the current directory. Messages
    without a body are skipped with a notice. ``HttpError`` from the API is
    reported on stdout rather than propagated.
    """
    creds = _load_credentials()
    try:
        service = build('gmail', 'v1', credentials=creds)
        query = 'from:"Uber Receipts"'
        results = service.users().messages().list(
            userId='me', q=query, maxResults=10).execute()
        messages = results.get('messages', [])
        if not messages:
            print('No messages found.')
            return
        for message in messages:
            msg = service.users().messages().get(
                userId='me', id=message['id'], format='full').execute()
            # The subject is sender-controlled text: sanitise it so a '/' or
            # '..' cannot redirect the write outside the working directory.
            subject = _safe_filename(get_subject(msg))
            received_time = get_received_time(msg)
            file_name = f"{subject}_{received_time.strftime('%Y%m%d_%H%M')}.pdf"
            body = get_email_body(msg)
            if not body:
                print(f'Skipped email {message["id"]} because it has no HTML body')
                continue
            with open(file_name, 'wb') as f:
                HTML(string=body).write_pdf(f)
            print(f'Saved email {message["id"]} as {file_name}')
    except HttpError as error:
        print(f'An error occurred: {error}')
# Run the export only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment