Skip to content

Instantly share code, notes, and snippets.

@hartez
Created October 4, 2015 19:21
Embed
What would you like to do?
Plotting email attachment data using pandas
import pandas as pd
import numpy as np
import humanfriendly
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import os.path
# Load the exported attachment records; the first row of the CSV holds the
# column names.
df = pd.read_csv('../attachments2.csv', header=0)
# Drop mail sent from my own address. Take an explicit copy so that adding
# columns to the filtered frame later does not raise SettingWithCopyWarning.
notFromMe = df.query('FromEmail != "hartez@gmail.com"').copy()
def filetype(row):
    """Return the type label used to group one attachment row.

    ``row`` is anything indexable by column name (a DataFrame row or a plain
    mapping) that provides 'ContentTypeName' and 'MediaSubtype'.

    Returns:
        * '' when 'ContentTypeName' is not a string (for example a missing
          value read from the CSV as NaN);
        * the extension of 'ContentTypeName', including the leading dot,
          when 'MediaSubtype' is the generic 'octet-stream';
        * 'MediaSubtype' unchanged otherwise.
    """
    name = row['ContentTypeName']
    # NOTE(review): a row without a usable name yields '' even if its
    # MediaSubtype is specific -- confirm that this is intended.
    if not isinstance(name, str):
        return ''

    subtype = row['MediaSubtype']
    if subtype == 'octet-stream':
        # The generic subtype carries no information, so fall back to the
        # file extension.
        return os.path.splitext(name)[1]
    return subtype
# Derive a FileType label per attachment, falling back to the filename
# extension where MediaSubtype doesn't carry the type. `assign` builds a new
# frame instead of writing into the filtered one, which avoids
# SettingWithCopyWarning.
notFromMe = notFromMe.assign(FileType=notFromMe.apply(filetype, axis=1))
# Group everything by file type
types = notFromMe.groupby(['FileType'])
# and get the attachment count and total size for each type
types = types.agg({'AttachmentId': 'count', 'Size': 'sum'})
# Grand totals, used as the denominators for the shares below
totalCount = types['AttachmentId'].sum()
totalSize = types['Size'].sum()
# Each type's share of the total count and of the total size, as a
# fraction (count or size divided by the corresponding grand total)
types['percentCount'] = types['AttachmentId'] / totalCount
types['percentSize'] = types['Size'] / totalSize
def combinedPlot(df, col, cutoff):
    """Show a pie chart of ``df[col]`` with the small slices merged.

    Parameters:
        df: DataFrame whose index labels name the pie slices.
        col: name of the column holding each row's share. The remainder is
            computed as ``1 - sum``, so the values are expected to be
            fractions of a whole.
        cutoff: rows whose value is strictly greater than this keep their
            own slice; everything else is folded into a single 'other'
            slice.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    # Select with a boolean mask rather than a string-built query so that
    # column names which are not valid identifiers still work.
    shares = df[col]
    slices = shares[shares > cutoff]

    # Whatever the kept slices do not account for becomes the 'other' slice.
    remaining = 1 - slices.sum()
    if remaining > 0:
        # Skip the slice when nothing is left over: a zero wedge is noise
        # and a slightly negative one (float rounding) is rejected by the
        # pie plot.
        other = pd.Series([remaining], index=['other'], name=col)
        # DataFrame.append was removed in pandas 2.0; concat is the
        # supported way to add the extra row.
        slices = pd.concat([slices, other])

    # Plot it
    slices.plot(kind='pie', figsize=(6, 6), title='Attachment Types')
    plt.show()
# Chart each type's share of the total attachment size, using 0.02 (2%) as
# the cutoff for a type to get its own slice.
combinedPlot(types, col='percentSize', cutoff=0.02)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment