A short project to run analytics on emails from a couple of mailing lists
import json
import string
import pytz
import operator
from datetime import datetime
from nltk.corpus import stopwords
from collections import defaultdict
def sort_dict(dictionary):
    return sorted(dictionary.items(), key=operator.itemgetter(1), reverse=True)
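# For illustration (not in the original gist): sort_dict turns a dict of
# counts into a list of (key, count) pairs, highest count first, e.g.
#   sort_dict({'a': 2, 'b': 5})  =>  [('b', 5), ('a', 2)]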
# EXTRACT DATA FROM .EMAILS FILES
LIST_OF_EMAILS = ['therearenosecrets@mit.edu', 'bentry@mit.edu', 'b-entry@mit.edu', 'letsshitoncourtney@mit.edu']
TOTAL_BENTRY_POPULATION = 36
emails = {}
for email in LIST_OF_EMAILS:
    # Each list's threads live in '<alias>.json', one JSON-encoded thread per line
    path = email.split('@')[0] + '.json'
    with open(path) as f:
        lines = f.readlines()
    threads = map(lambda line: json.loads(line[:-1]), lines)
    emails[email] = threads
# Start the analytics!
analytics_dump = {}

print '\nWhich B Entry alias is most popular?'
email_alias_counts = {}
for email, threads in emails.iteritems():
    num_messages = reduce(lambda x, y: x + len(y), threads, 0)
    num_threads = len(threads)
    email_alias_counts[email] = [num_threads, num_messages]
    print '\t', email, 'has', num_threads, 'threads and', num_messages, 'messages'
analytics_dump['email_alias_counts'] = email_alias_counts
def enum(*sequential, **named):
    enums = dict(zip(sequential, range(len(sequential))), **named)
    reverse = dict((value, key) for key, value in enums.iteritems())
    enums['reverse_mapping'] = reverse
    return type('Enum', (), enums)
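# For illustration (not in the original gist): the enum helper builds a
# class whose attributes map names to indices, plus a reverse lookup, e.g.
#   Color = enum('RED', 'GREEN', BLUE=5)
#   Color.RED                 =>  0
#   Color.reverse_mapping[5]  =>  'BLUE'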
print '\nWhat time is the busiest?'
time_counts = [0]*24
day_counts = [0]*7  # Monday=0, Tuesday=1, ..., Sunday=6
month_counts = [0]*12
Semester = enum(FALL=0, IAP=1, SPRING=2, SUMMER=3)
semesters = {Semester.FALL: 0, Semester.IAP: 0, Semester.SPRING: 0, Semester.SUMMER: 0}
all_semester_totals = {2013: semesters.copy(), 2014: semesters.copy()}
five_am_email = None

def get_semester(month, day):
    # month is a 0-indexed list index: January=0, ..., December=11
    if month == 0:
        return Semester.IAP
    elif 1 <= month <= 4:
        return Semester.SPRING
    elif 5 <= month <= 6 or (month == 7 and day < 26):
        return Semester.SUMMER
    else:
        return Semester.FALL
def get_time_info(timestamp, get_datetime_obj=False):
    # Gmail timestamps are in epoch milliseconds
    date = datetime.fromtimestamp(int(timestamp)/1000.0, tz=pytz.timezone('US/Eastern'))
    year = date.year
    time = date.time()
    weekday = date.weekday()
    month = date.month - 1  # Convert month number to list index
    day = date.day
    return (year, month, day, weekday, time)

def get_school_year(year, month, day):
    # A school year is labeled by its fall; spring, IAP, and summer belong
    # to the previous fall's year
    semester = get_semester(month, day)
    if semester != Semester.FALL: year -= 1
    return year
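# Worked example (added for clarity, not in the original gist): an email
# stamped 2014-02-10 has month index 1 (February), so
#   get_semester(1, 10)            =>  Semester.SPRING
#   get_school_year(2014, 1, 10)   =>  2013  (the 2013-2014 school year)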
semester_totals_2 = [0]*4
encountered_messages = set()
for email, threads in emails.iteritems():
    for thread in threads:
        for message in thread:
            # Skip messages already seen on another alias
            if str(message) in encountered_messages:
                continue
            else:
                encountered_messages.add(str(message))
            year, month, day, weekday, time = get_time_info(message['date'])
            time_counts[time.hour] += 1
            day_counts[weekday] += 1
            month_counts[month] += 1
            semester = get_semester(month, day)
            all_semester_totals[get_school_year(year, month, day)][semester] += 1
            if time.hour == 5:
                five_am_email = message
total_13_14 = sum(all_semester_totals[2013].values())
total_14_15 = sum(all_semester_totals[2014].values())
total = total_13_14 + total_14_15
print '\tEmail frequency across hours in the day',
print '\t\t', time_counts, sum(time_counts)
print '\tEmail frequency across the days of the week',
print '\t\t', day_counts, sum(day_counts)
print '\tEmail frequency across the months',
print '\t\t', month_counts, sum(month_counts)
print '\tEmail frequency by semester',
print '\t\t', all_semester_totals
print '\tTotal for 2013-2014:', total_13_14
print '\tTotal for 2014-2015:', total_14_15
print
print '\tAnd the only email ever sent between the hours of 5AM and 6AM in the last two years\n\n', five_am_email, '\n'
analytics_dump['time_counts'] = time_counts
analytics_dump['day_counts'] = day_counts
analytics_dump['month_counts'] = month_counts
analytics_dump['all_semester_totals'] = all_semester_totals
analytics_dump['total_13_14'] = total_13_14
analytics_dump['total_14_15'] = total_14_15
analytics_dump['total'] = total
# Analysis of sender email frequencies
ENTRY = {
    # A KEY-VALUE STORE MAPPING A NAME TO A SET OF EMAILS USED BY INDIVIDUAL
    # OMITTED FOR PRIVACY REASONS
}
# Extracted from the last two years of email senders, listed above
ENTRY_2013 = []  # LIST OF NAMES OMITTED FOR PRIVACY REASONS
ENTRY_2014 = list(set(ENTRY.keys()) - set(ENTRY_2013))
NON_ENTRY = {
    # A KEY-VALUE STORE MAPPING A NAME TO A SET OF EMAILS USED BY INDIVIDUAL
    # OMITTED FOR PRIVACY REASONS
}
COMBINE = ENTRY.copy()
COMBINE.update(NON_ENTRY)

def find_sender_by_sendfield(query_email):
    # Match any of a person's known addresses against the raw 'From' field
    for person, email_ids in COMBINE.iteritems():
        for email in email_ids:
            if email in query_email.lower():
                return person
    print 'Could not find person', query_email
    return None

def find_sender_by_name(query_name):
    # Match on any token of a person's name, as a substring in either direction
    for person, email_ids in COMBINE.iteritems():
        names = person.split()
        for name in names:
            if name == "": continue
            name = name.lower()
            if name in query_name or query_name in name:
                return person
    print 'Could not find person', query_name
    return None
emails_by_sender = defaultdict(list)
for email, threads in emails.iteritems():
    for thread in threads:
        for message in thread:
            emails_by_sender[find_sender_by_sendfield(message['sender'])].append(message)
sender_count = {}
for sender, sender_emails in emails_by_sender.iteritems():
    sender_count[sender] = len(sender_emails)
sender_count_by_year = {2013: defaultdict(int), 2014: defaultdict(int)}
for sender, sender_emails in emails_by_sender.iteritems():
    for email in sender_emails:
        year, month, day, weekday, time = get_time_info(email['date'])
        sender_count_by_year[get_school_year(year, month, day)][sender] += 1
print '\nSenders sorted by total number of emails'
sorted_senders = sort_dict(sender_count)
print '\t', sorted_senders
print '\nSenders each year, sorted by total number of emails'
sorted_senders_2013 = sort_dict(sender_count_by_year[2013])
sorted_senders_2014 = sort_dict(sender_count_by_year[2014])
print '\t2013:'
print '\t', sorted_senders_2013
print '\t2014:'
print '\t', sorted_senders_2014
total_unique_senders_2013 = len(sorted_senders_2013)
total_unique_senders_2014 = len(sorted_senders_2014)
percent_never_sent_entry_email = 3.0*100/TOTAL_BENTRY_POPULATION
print '\nThe total number of unique senders in 2013 was', total_unique_senders_2013
print 'The total number of unique senders in 2014 was', total_unique_senders_2014
print 'In 2014-2015,', percent_never_sent_entry_email, '% of the entry has never sent an email'
analytics_dump['total_unique_senders_2013'] = total_unique_senders_2013
analytics_dump['total_unique_senders_2014'] = total_unique_senders_2014
analytics_dump['percent_never_sent_entry_email'] = percent_never_sent_entry_email
analytics_dump['sorted_senders'] = sorted_senders
analytics_dump['sorted_senders_2013'] = sorted_senders_2013
analytics_dump['sorted_senders_2014'] = sorted_senders_2014
GRTs_2013 = 'kim'
GRTs_2014 = ['cooper', 'christine']
total_grt_traffic_2013 = len(emails_by_sender[GRTs_2013])
total_grt_traffic_2014 = len(emails_by_sender[GRTs_2014[0]]) + len(emails_by_sender[GRTs_2014[1]])
total_grt_traffic = total_grt_traffic_2013 + total_grt_traffic_2014
percent_traffic_GRT = total_grt_traffic*100.0/total
percent_traffic_GRTs_2013 = total_grt_traffic_2013*100.0/total_13_14
percent_traffic_GRTs_2014 = total_grt_traffic_2014*100.0/total_14_15
analytics_dump['percent_traffic_GRT'] = percent_traffic_GRT
analytics_dump['percent_traffic_GRTs_2013'] = percent_traffic_GRTs_2013
analytics_dump['percent_traffic_GRTs_2014'] = percent_traffic_GRTs_2014
print '\nGRTs make up', percent_traffic_GRT, "% of traffic."
print 'In 2013-2014, GRTs contributed to', percent_traffic_GRTs_2013, "% of traffic."
print 'And in 2014-2015, GRTs contributed to', percent_traffic_GRTs_2014, "% of traffic."

n = 5
top_n_senders_total = 0
for i in xrange(n):
    top_n_senders_total += sorted_senders[i][1]
top_n_senders_percent = top_n_senders_total*100.0/total
top_n_senders_population_percent = n*100.0/(len(ENTRY)+len(NON_ENTRY))
print '\nThe top', n, 'senders make up', top_n_senders_percent, '% of all traffic, but only', top_n_senders_population_percent, '% of all senders'
analytics_dump['top_n_senders_percent'] = top_n_senders_percent
analytics_dump['top_n_senders_population_percent'] = top_n_senders_population_percent
analytics_dump['n'] = n
max_thread = []
for email, threads in emails.iteritems():
    for thread in threads:
        max_thread = thread if len(thread) > len(max_thread) else max_thread
print '\nThe thread with the most messages has length', len(max_thread), 'and has subject', max_thread[0]['subject'], 'and first message:', max_thread[0]['body'][0:50], '...'
analytics_dump['len_max_thread'] = len(max_thread)
analytics_dump['max_thread_starter'] = max_thread[0]['body'][0:50]
analytics_dump['max_thread_sender'] = find_sender_by_sendfield(max_thread[0]['sender'])
all_subject_words = defaultdict(int)
all_body_words = defaultdict(int)
all_words = defaultdict(int)
all_names_subject = defaultdict(int)
FORBIDDEN_SNIPPETS = ['mitedu', 'gmail', 'http', 'att000', 'image001', 'jpg', 'png']
BORING_WORDS = set(stopwords.words('english')) | set(['re', 'fw', 'fwd', 'im', 'm', 'youre', '1', 'f', 'd', 'cc', 'pm', 'wrote', 'subject', 'sent', '2014', '30'])
NAMES = []  # LIST OF NAMES OMITTED FOR PRIVACY REASONS
MONTHS = ['oct', 'may', 'january', 'october', 'jan', 'feb', 'sep']
SIGNATURES = ['massachusetts', 'technology', 'institute', 'class']

def fix_word(word):
    # Lowercase, strip whitespace, drop non-ASCII characters, strip punctuation
    return word.lower().strip().encode('ascii', 'ignore').translate(string.maketrans("", ""), string.punctuation)

def should_filter(fixed_word, filter_names=True, filter_months=True, filter_number=True, filter_signatures=True):
    if not fixed_word: return True
    if any(snippet in fixed_word for snippet in FORBIDDEN_SNIPPETS): return True
    if fixed_word[0:3] == 'cid' and len(fixed_word) > 20: return True  # inline-attachment content IDs
    if fixed_word in BORING_WORDS: return True
    if filter_names and fixed_word in NAMES: return True
    if filter_months and fixed_word in MONTHS: return True
    if filter_signatures and fixed_word in SIGNATURES: return True
    if filter_number:
        try:
            int(fixed_word)
            return True
        except ValueError:
            return False
    else:
        return False
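# For illustration (not in the original gist), the normalize-then-filter
# pipeline behaves like this:
#   fix_word('Hello!')         =>  'hello'  (lowercased, punctuation stripped)
#   should_filter('the')       =>  True     (NLTK stopword)
#   should_filter('2015')      =>  True     (pure number)
#   should_filter('pancakes')  =>  False    (survives to be counted)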
def is_name(word):
    return word in (set(NAMES) & set(COMBINE.keys()))

for email, threads in emails.iteritems():
    for thread in threads:
        for message in thread:
            body_words = message['body'].split()
            subject_words = message['subject'].split()
            for word in body_words:
                fixed_word = fix_word(word)
                if should_filter(fixed_word): continue
                all_body_words[fixed_word] += 1
                all_words[fixed_word] += 1
            for word in subject_words:
                fixed_word = fix_word(word)
                if is_name(fixed_word): all_names_subject[fixed_word] += 1
                if should_filter(fixed_word): continue
                all_subject_words[fixed_word] += 1
                all_words[fixed_word] += 1
print '\n', '100 most frequent words in body lines'
sorted_body_words = sort_dict(all_body_words)
print '\t', sorted_body_words[:100]
print '\n', '100 most frequent words in subject lines'
sorted_subject_words = sort_dict(all_subject_words)
print '\t', sorted_subject_words[:100]
print '\n', 'Most frequent names in subject lines'
sorted_subject_names = sort_dict(all_names_subject)
print '\t', sorted_subject_names
analytics_dump['sorted_body_words'] = sorted_body_words
analytics_dump['sorted_subject_words'] = sorted_subject_words
analytics_dump['sorted_subject_names'] = sorted_subject_names
profanity_file = open('google_bad_words.json', 'r')
profanity_line = profanity_file.read()
bad_words = set(json.loads(profanity_line).keys())
used_bad_words = set(all_words.keys()) & bad_words
print '\nWe\'ve used these instances of profanity in our emails'
for word in used_bad_words:
    print '\t', word, 'used', all_words[word], 'times'
profane_people = defaultdict(list)
for email, threads in emails.iteritems():
    for thread in threads:
        for message in thread:
            body_words = message['body'].split()
            for word in body_words:
                fixed_word = fix_word(word)
                if fixed_word in bad_words:
                    profane_people[find_sender_by_sendfield(message['sender'])].append(fixed_word)
profanity_count = {}
for person, profanity_list in profane_people.iteritems():
    profanity_count[person] = len(profanity_list)
sorted_profanity_count = sort_dict(profanity_count)
sorted_profanity_people_words = []
print '\nThe profanity used, distributed across each person:'
for person, count in sorted_profanity_count:
    sorted_profanity_people_words.append((person, profane_people[person]))
    print '\t', person, 'used', profane_people[person]
print
analytics_dump['sorted_profanity_people_words'] = sorted_profanity_people_words

# Write out all computed statistics for later use
analytics_file_path = 'results.json'
analytics_file = open(analytics_file_path, 'w')
analytics_file.write(json.dumps(analytics_dump))
analytics_file.close()
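# For reference (not in the original gist): results.json holds one JSON
# object keyed by the statistics computed above ('time_counts',
# 'sorted_senders', 'total', ...). A hypothetical downstream plotting
# script could reload it like so:
#   with open('results.json') as f:
#       stats = json.load(f)
#   print stats['time_counts']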
# Uses the Gmail API (https://developers.google.com/gmail/api/)
# to collect emails sent to the specified addresses, parse them, and save
# them as a series of files, one per mailing list, with one
# JSON-formatted message thread per line
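# For illustration (not in the original gist), each line of an output file
# is one thread: a JSON list of message objects shaped like
#   [{"date": 1412345678000, "sender": "...", "receiver": "bentry@mit.edu",
#     "subject": "...", "body": "..."}, ...]
# where 'date' is Gmail's internalDate in epoch milliseconds.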
import httplib2
import time
import os
import base64
import json
import datetime
from apiclient import errors
from apiclient import discovery
import oauth2client
from oauth2client import client
from oauth2client import tools
try:
    import argparse
    flags = argparse.ArgumentParser(parents=[tools.argparser]).parse_args()
except ImportError:
    flags = None
print 'FLAGS', flags
SCOPES = 'https://www.googleapis.com/auth/gmail.readonly'
CLIENT_SECRET_FILE = 'client_secret.json'
APPLICATION_NAME = 'Skanda\'s Data Playground'
def get_credentials():
    """Gets valid user credentials from storage.

    If nothing has been stored, or if the stored credentials are invalid,
    the OAuth2 flow is completed to obtain the new credentials.

    Returns:
        Credentials, the obtained credential.
    """
    home_dir = os.path.expanduser('~')
    credential_dir = os.path.join(home_dir, '.credentials')
    if not os.path.exists(credential_dir):
        os.makedirs(credential_dir)
    credential_path = os.path.join(credential_dir, 'gmail-quickstart.json')
    store = oauth2client.file.Storage(credential_path)
    credentials = store.get()
    if not credentials or credentials.invalid:
        flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES)
        flow.user_agent = APPLICATION_NAME
        if flags:
            credentials = tools.run_flow(flow, store, flags)
        else:  # Needed only for compatibility with Python 2.6
            credentials = tools.run(flow, store)
        print 'Storing credentials to ' + credential_path
    return credentials
def analyze_part(part, headers, unix_date, emails):
    # Only parts that carry base64url-encoded body data are useful
    if 'data' in part['body']:
        b64_body = part['body']['data'].encode('ascii', 'ignore')
    else:
        print '\t\tDiscarding because no data', part
        return
    body = base64.b64decode(b64_body, '-_')
    sender, receiver, subject = '', '', ''  # defensive defaults in case a header is missing
    valid_receiver = None
    for header in headers:
        if header['name'] == 'From':
            sender = header['value'].encode('ascii', 'ignore')
        elif header['name'] == 'To':
            receiver = header['value'].encode('ascii', 'ignore')
            for email in emails:
                if email in receiver.lower():
                    valid_receiver = email
        elif header['name'].lower() == 'cc':
            receiver = header['value'].encode('ascii', 'ignore')
            for email in emails:
                if email in receiver.lower():
                    valid_receiver = email
        elif header['name'] == 'Subject':
            subject = header['value'].encode('ascii', 'ignore')
    if valid_receiver:
        seconds = unix_date/1000.0
        real_date = datetime.datetime.fromtimestamp(seconds).strftime('%Y-%m-%d %H:%M:%S.%f')
        print '\t\t', 'Saving message from', real_date, 'with subject', '"' + subject + '"'
        message = {'date': unix_date, 'sender': sender.strip(), 'receiver': valid_receiver.strip(),
                   'subject': subject.strip(), 'body': body.strip().replace('\n', ' ').replace('\r', ' ')}
        return message
    else:
        print '\t\tDiscarding because no target emails match', receiver.lower()
def get_thread(service, thread_id, emails):
    raw_thread = service.users().threads().get(userId='me', id=thread_id).execute()
    raw_messages = raw_thread['messages']
    time.sleep(1)  # Stay well under the Gmail API rate limits
    print '\tGetting thread with ID', thread_id, 'and', len(raw_messages), 'messages'
    messages = []
    for raw_message in raw_messages:
        print '\t\tAnalyzing a message...'
        unix_date = int(raw_message['internalDate'])
        payload = raw_message['payload']
        # Walk the MIME tree, looking for the first text/plain part
        parts = []
        parts.append(payload)
        for part in parts:
            if 'parts' in part.keys():
                parts.extend(part['parts'])
            if 'body' in part.keys() and part['mimeType'] == 'text/plain':
                message = analyze_part(part, payload['headers'], unix_date, emails)
                if message:
                    messages.append(message)
                break
    return messages
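# For reference (not in the original gist): a Gmail API message payload is
# a MIME tree, roughly:
#   payload = {'mimeType': 'multipart/alternative', 'headers': [...],
#              'body': {...},
#              'parts': [{'mimeType': 'text/plain', 'body': {'data': <base64url>}},
#                        {'mimeType': 'text/html',  'body': {'data': <base64url>}}]}
# The loop above appends nested 'parts' onto the worklist as it iterates,
# so it visits the whole tree and keeps the first text/plain part it finds.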
def write_thread_to_file(my_thread, out_file):
    json_string = json.dumps(my_thread)
    out_file.write(json_string + '\n')

def recordEmailThreadBunch(service, email, emails):
    print 'Getting email threads from', email
    filename = email.split('@')[0] + '.json'
    out_file = open(filename, 'w')
    query = 'to:' + email
    threads_on_page = service.users().threads().list(userId='me', q=query).execute()
    try:
        if 'threads' in threads_on_page:
            for thread_snippet in threads_on_page['threads']:
                thread = get_thread(service, thread_snippet['id'], emails)
                write_thread_to_file(thread, out_file)
        # Thread listings are paginated; follow nextPageToken until exhausted
        while 'nextPageToken' in threads_on_page:
            print '\tTurning page for email', email
            page_token = threads_on_page['nextPageToken']
            threads_on_page = service.users().threads().list(userId='me', pageToken=page_token, q=query).execute()
            for thread_snippet in threads_on_page['threads']:
                thread = get_thread(service, thread_snippet['id'], emails)
                write_thread_to_file(thread, out_file)
    except errors.HttpError, error:
        print 'An error occurred: %s' % error
    print 'Closing write file for', email
    out_file.close()
def set_up_credentials():
    print 'Setting up credentials...'
    credentials = get_credentials()
    http = credentials.authorize(httplib2.Http())
    service = discovery.build('gmail', 'v1', http=http)
    print 'Finished setting up credentials...'
    return service

def main():
    service = set_up_credentials()
    emails = ['therearenosecrets@mit.edu', 'letsshitoncourtney@mit.edu', 'b-entry@mit.edu', 'bentry@mit.edu']
    for email in emails:
        recordEmailThreadBunch(service, email, emails)

if __name__ == '__main__':
    main()
skoppula commented Nov 3, 2017

I lived in a group that uses four mailing lists to communicate. I was interested in a few statistics about the emails: the most frequent senders, the most frequented mailing lists, the most profane senders, and more. So I spent a weekend collecting all the emails sent to these mailing lists over two years and crunching a few numbers on them.

What was collected?

I took a look at email activity on four different mailing lists over the last two years.

What was analyzed? What are some results?

For privacy reasons, I can’t disclose all the email results of our living group, but here are some interesting tidbits:

  • Out of over a thousand emails, only one was ever sent between the hours of 5 a.m. and 6 a.m.
  • 11 p.m. is the busiest time of day for email traffic in our living group
  • The most prolific sender sent more than 200 emails over the course of two years
  • The top five senders make up more than 40% of traffic
  • The longest email thread is 36 emails
  • Our living group’s most common subject-line words: study, break, anyone, lounge, dinner, super, 11pm, pancakes, cake, and food
  • Our living group’s most common email-body words: MIT, like, know, super, want, please, hey, guys
  • In two years, only two people have ever used the word ‘butt’. About half of the living group has used a profane word in an email at some point in the last two years.

Feel free to browse the scripts used to collect and analyze the data. I was planning to add some visualizations to the statistics, but I was wrapping up my internship around the same time and it quickly became crunch time at work before I got around to topping off the analysis with graphs. Please enjoy.
