Skip to content

Instantly share code, notes, and snippets.

@gbolmier
Last active September 6, 2019 22:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gbolmier/b6a942699aaaedec54041a32e4f34d40 to your computer and use it in GitHub Desktop.
Save gbolmier/b6a942699aaaedec54041a32e4f34d40 to your computer and use it in GitHub Desktop.
Code ran to build trec07p CSV available in `creme` library
import email
import warnings
import pandas as pd
from bs4 import BeautifulSoup
def parse_body(message):
"""Parses text body from email message object with BeautifulSoup.
Parameters:
message (email.Message object): Loaded email with Python standard library email module.
Returns:
body (str): Email text body.
"""
body = ''
if message.is_multipart():
for part in message.walk():
content_type = part.get_content_type()
content_disposition = str(part.get('Content-Disposition'))
if (content_type in ['text/html', 'text/txt'] and
'attachment' not in content_disposition):
body = part.get_payload(decode=True)
break
else:
body = message.get_payload(decode=True)
return BeautifulSoup(body, 'html5lib').get_text()
def stream_trec07p(dataset_path):
"""2007 TREC’s Spam Track dataset.
The data contains 75,419 chronologically ordered items, i.e. 3 months of emails delivered
to a particular server in 2007. Spam messages represent 66.6% of the dataset.
The goal is to predict whether an email is a spam or not.
Parsed features are: sender, recipients, date, subject, body.
Parameters:
dataset_path (str): The directory where the data is stored.
Yields:
tuple: 5 features (`sender`, `recipients`, `date`, `subject`, `body`) and `y` the target.
References:
1. `TREC 2007 Spam Track Overview <https://trec.nist.gov/pubs/trec16/papers/SPAM.OVERVIEW16.pdf>`_
"""
warnings.filterwarnings('ignore', category=UserWarning, module='bs4')
with open(f'{dataset_path}/full/index') as full_index:
for row in full_index:
label, filepath = row.split()
ix = filepath.split('.')[-1]
with open(f'{dataset_path}/data/inmail.{ix}', 'rb') as email_file:
message = email.message_from_binary_file(email_file)
yield (
message['from'],
message['to'],
message['date'],
message['subject'],
parse_body(message),
label
)
# Parse emails and save into CSV
dataset_path = '/home/gbolmier/Downloads/trec07p'
parsed_emails = [parsed_email for parsed_email in stream_trec07p(dataset_path)]
columns = ['sender', 'recipients', 'date', 'subject', 'body', 'y']
df = pd.DataFrame(parsed_emails, columns=columns)
df.to_csv('/home/gbolmier/Downloads/trec07p.csv', index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment