Skip to content

Instantly share code, notes, and snippets.

@hartez
Created October 4, 2015 19:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hartez/b5c1ab4f1a2da662b7d5 to your computer and use it in GitHub Desktop.
Save hartez/b5c1ab4f1a2da662b7d5 to your computer and use it in GitHub Desktop.
Plotting email attachment data using pandas
import pandas as pd
import numpy as np
import humanfriendly
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import os.path
# Read in our email data file; header = 0 means the first line of the CSV
# supplies the column names (presumably one row per attachment -- verify
# against the file).
df = pd.read_csv('../attachments2.csv', header = 0)
# Filter out sent mail: keep only the rows whose FromEmail is not my own address
notFromMe = df.query('FromEmail != "hartez@gmail.com"')
def filetype(row):
    """Return the file-type label for a single attachment row.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing
            the keys 'ContentTypeName' and 'MediaSubtype'.

    Returns:
        '' when 'ContentTypeName' is not a string (e.g. a missing value).
        When 'MediaSubtype' is 'octet-stream', the extension of
        'ContentTypeName' including its leading dot ('' if it has none).
        Otherwise the 'MediaSubtype' value unchanged.
    """
    name = row['ContentTypeName']
    # Rows without a usable name are bucketed under the empty label.
    if not isinstance(name, str):
        return ''
    # 'octet-stream' says nothing about the real type, so fall back to the
    # extension of the attachment's name.
    if row['MediaSubtype'] == 'octet-stream':
        return os.path.splitext(name)[1]
    return row['MediaSubtype']
# Extract type data from the filename where MediaSubtype doesn't have it.
# assign() builds a new frame instead of writing a column into the filtered
# query result in place, which avoids pandas' SettingWithCopyWarning.
notFromMe = notFromMe.assign(FileType=notFromMe.apply(filetype, axis=1))
# Group everything by file type
grouped = notFromMe.groupby(['FileType'])
# and get the attachment count and total size for each type
types = grouped.agg({'AttachmentId': 'count', 'Size': 'sum'})
# Get total amounts so we can calculate each type's share
totalCount = types['AttachmentId'].sum()
totalSize = types['Size'].sum()
# Divide by the totals to get each type's share of the attachment count and
# of the total size. Despite the column names these are fractions in [0, 1],
# not values scaled to 100.
types['percentCount'] = types['AttachmentId'] / totalCount
types['percentSize'] = types['Size'] / totalSize
def combinedPlot(df, col, cutoff):
    """Show a pie chart of the fractions in `col`, lumping small slices together.

    Args:
        df: DataFrame indexed by slice label. `col` holds each label's
            fraction of the whole; the fractions are expected to sum to 1,
            because the 'other' slice is computed as 1 minus the kept ones.
        col: Name of the column of fractions to plot.
        cutoff: Rows whose fraction is strictly greater than this value get
            their own slice; everything else is folded into 'other'.
    """
    # Keep only the rows above the cutoff. Boolean indexing (rather than a
    # query string assembled from `col`) also works for column names that
    # are not valid Python identifiers.
    overCutoff = df[df[col] > cutoff]
    # Whatever is left over becomes the 'other' slice. Clamp at zero so that
    # floating-point rounding can never hand the pie plot a negative wedge.
    remaining = max(0.0, 1 - overCutoff[col].sum())
    other = pd.Series([remaining], index=['other'], name=col)
    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    both = pd.concat([overCutoff[col], other])
    # Plot it
    both.plot(kind='pie', figsize=(6, 6), title='Attachment Types')
    plt.show()
# Pie chart of each file type's share of the total attachment size; types at
# or below a 2% share are lumped together into the 'other' slice.
combinedPlot(types, 'percentSize', 0.02)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment