Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Plotting email attachment data using pandas
import pandas as pd
import numpy as np
import humanfriendly
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import os.path
# Load the email attachment data file; row 0 of the CSV supplies the
# column names.
df = pd.read_csv('../attachments2.csv', header=0)

# Keep only the rows whose sender is not my own address, i.e. drop sent mail.
notFromMe = df.query('FromEmail != "hartez@gmail.com"')
def filetype(row):
    """Return the file-type label for a single attachment row.

    ``row`` is anything indexable by 'ContentTypeName' and 'MediaSubtype'
    (it is called with one DataFrame row at a time via ``apply(axis=1)``).

    * If 'ContentTypeName' is not a string (e.g. a missing value), the
      result is the empty string.
    * If 'MediaSubtype' is the generic 'octet-stream', the result is the
      extension of 'ContentTypeName', including the leading dot.
    * Otherwise the result is 'MediaSubtype' itself.
    """
    name = row['ContentTypeName']
    if not isinstance(name, str):
        return ''

    subtype = row['MediaSubtype']
    if subtype != 'octet-stream':
        return subtype

    # The subtype carries no information, so fall back to the extension.
    _, extension = os.path.splitext(name)
    return extension
# Extract type data from the filename where MediaSubtype doesn't have it.
# assign() returns a new frame instead of writing a column into the result
# of df.query(), which avoids pandas' SettingWithCopyWarning.
notFromMe = notFromMe.assign(FileType=notFromMe.apply(filetype, axis=1))

# Group everything by mime type and get the attachment count and the
# total size for each one.
types = notFromMe.groupby(['FileType']).agg(
    {'AttachmentId': 'count', 'Size': 'sum'}
)

# Get total amounts so we can calculate each type's share below.
totalCount = types['AttachmentId'].sum()
totalSize = types['Size'].sum()

# Divide by the totals to get each mime type's share. Despite the
# "percent" names these are fractions of the total, not values out of 100.
types['percentCount'] = types['AttachmentId'] / totalCount
types['percentSize'] = types['Size'] / totalSize
def combinedPlot(df, col, cutoff):
    """Show a pie chart of ``df[col]`` with the small slices merged.

    Every row whose ``col`` value is strictly greater than ``cutoff`` gets
    its own slice, labelled by the row's index value. All remaining rows
    are represented by a single slice labelled 'other'.

    Args:
        df: DataFrame whose index supplies the slice labels.
        col: Name of the column to plot. The 'other' slice is computed as
            ``1 - sum(kept values)``, so this assumes the column holds
            fractions that add up to 1 over the whole frame.
        cutoff: Exclusive lower bound a value must exceed to keep its own
            slice.

    Returns:
        None. The chart is displayed with ``plt.show()``.
    """
    # Select with a boolean mask rather than a query string assembled by
    # concatenation, so column names that are not valid identifiers work too.
    shares = df.loc[df[col] > cutoff, col]

    # Fill in the 'other' section with whatever the kept slices leave over.
    other = pd.Series([1 - shares.sum()], index=['other'], name=col)

    # DataFrame.append was removed in pandas 2.0; pd.concat replaces it.
    both = pd.concat([shares, other])

    # Plot it
    both.plot(kind='pie', figsize=(6, 6), title='Attachment Types')
    plt.show()
# Chart each type's share of the total attachment size; types that do not
# exceed 2% of the total are folded into the 'other' slice.
combinedPlot(types, col='percentSize', cutoff=0.02)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.